remove inst/doc files

master
jan 1 year ago
parent b26ca150a9
commit 62667ef139

@ -1,133 +0,0 @@
## ----setup, results='hide', message=FALSE, warning=FALSE-----------------
# required packages:
library(opensensmapr) # data download
library(dplyr) # data wrangling
library(ggplot2) # plotting
library(lubridate) # date arithmetic
library(zoo) # rollmean()
## ----download------------------------------------------------------------
# if you want to see results for a specific subset of boxes,
# just specify a filter such as grouptag='ifgi' here
boxes = osem_boxes()
## ----exposure_counts, message=FALSE--------------------------------------
exposure_counts = boxes %>%
  group_by(exposure) %>%
  mutate(count = row_number(createdAt))

exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')

ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
  geom_line() +
  scale_colour_manual(values = exposure_colors) +
  xlab('Registration Date') + ylab('senseBox count')
## ----exposure_summary----------------------------------------------------
exposure_counts %>%
  summarise(
    oldest = min(createdAt),
    newest = max(createdAt),
    count = max(count)
  ) %>%
  arrange(desc(count))
## ----grouptag_counts, message=FALSE--------------------------------------
grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members
  filter(n() >= 8 & !is.na(grouptag)) %>%
  mutate(count = row_number(createdAt))

# helper for sorting the grouptags by box count
sortLvls = function(oldFactor, ascending = TRUE) {
  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
  factor(oldFactor, levels = lvls)
}
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)

ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
  geom_line(aes(group = grouptag)) +
  xlab('Registration Date') + ylab('senseBox count')
## ----grouptag_summary----------------------------------------------------
grouptag_counts %>%
  summarise(
    oldest = min(createdAt),
    newest = max(createdAt),
    count = max(count)
  ) %>%
  arrange(desc(count))
## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'----
bins = 'week'
mvavg_bins = 6
growth = boxes %>%
  mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'registered')
## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'----
inactive = boxes %>%
  # remove boxes that were updated in the last two days,
  # b/c any box becomes inactive at some point by definition of updatedAt
  filter(updatedAt < now() - days(2)) %>%
  mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'inactive')
## ----growthrate, warning=FALSE, message=FALSE, results='hide'------------
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)

ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks = 'years', date_labels = '%Y') +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  # moving average, make first and last value NA (to ensure identical length of vectors)
  geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
## ----exposure_duration, message=FALSE------------------------------------
duration = boxes %>%
  group_by(exposure) %>%
  filter(!is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units = 'days'))

ggplot(duration, aes(x = exposure, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')
## ----grouptag_duration, message=FALSE------------------------------------
duration = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members
  filter(n() >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units = 'days'))

ggplot(duration, aes(x = grouptag, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')

duration %>%
  summarize(
    duration_avg = round(mean(duration)),
    duration_min = round(min(duration)),
    duration_max = round(max(duration)),
    oldest_box = round(max(difftime(now(), createdAt, units = 'days')))
  ) %>%
  arrange(desc(duration_avg))
## ----year_duration, message=FALSE----------------------------------------
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
duration = boxes %>%
  mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
  group_by(year) %>%
  filter(!is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units = 'days'))

ggplot(duration, aes(x = substr(as.character(year), 1, 4), y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')

@ -1,243 +0,0 @@
---
title: "Visualising the History of openSenseMap.org"
author: "Norwin Roosen"
date: '`r Sys.Date()`'
output:
  rmarkdown::html_vignette:
    df_print: kable
    fig_height: 5
    fig_width: 7
    toc: yes
  html_document:
    code_folding: hide
    df_print: kable
    theme: lumen
    toc: yes
    toc_float: yes
vignette: >
  %\VignetteIndexEntry{Visualising the History of openSenseMap.org}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
> This vignette serves as an example of data wrangling & visualization with
> `opensensmapr`, `dplyr` and `ggplot2`.
```{r setup, results='hide', message=FALSE, warning=FALSE}
# required packages:
library(opensensmapr) # data download
library(dplyr) # data wrangling
library(ggplot2) # plotting
library(lubridate) # date arithmetic
library(zoo) # rollmean()
```
openSenseMap.org has grown quite a bit in the last years; it would be interesting
to see how we got to the current `r osem_counts()$boxes` sensor stations,
split up by various attributes of the boxes.
While `opensensmapr` provides extensive methods of filtering boxes by attributes
on the server, we do the filtering within R to save time and gain flexibility.
So the first step is to retrieve *all the boxes*:
```{r download}
# if you want to see results for a specific subset of boxes,
# just specify a filter such as grouptag='ifgi' here
boxes = osem_boxes()
```
# Plot count of boxes by time {.tabset}
By looking at the `createdAt` attribute of each box we know the exact time a box
was registered.
With this approach we have no information about boxes that were deleted in the
meantime, but that's okay for now.
## ...and exposure
```{r exposure_counts, message=FALSE}
exposure_counts = boxes %>%
  group_by(exposure) %>%
  mutate(count = row_number(createdAt))

exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')

ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
  geom_line() +
  scale_colour_manual(values = exposure_colors) +
  xlab('Registration Date') + ylab('senseBox count')
```
Outdoor boxes are growing *fast*!
We can also see the introduction of `mobile` sensor "stations" in 2017. While
mobile boxes are still few, we can expect a quick rise in 2018 once the new
[senseBox MCU with GPS support is released](https://sensebox.de/blog/2018-03-06-senseBox_MCU).
Let's have a quick summary:
```{r exposure_summary}
exposure_counts %>%
  summarise(
    oldest = min(createdAt),
    newest = max(createdAt),
    count = max(count)
  ) %>%
  arrange(desc(count))
```
## ...and grouptag
We can try to find out where the increases in growth came from by analysing the
box count by grouptag.

Caveats: only a small subset of boxes has a grouptag, so we should assume that
these groups are actually bigger. Also, grouptag naming is inconsistent
(`Luftdaten`, `luftdaten.info`, ...).
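One way to reduce this inconsistency could be to normalise the tags before counting.
The following sketch is not evaluated, and its recoding map is illustrative rather
than complete:

```{r grouptag_cleanup, eval=FALSE}
# hypothetical clean-up: lowercase and trim the tags, then merge known spelling variants
boxes = boxes %>%
  mutate(grouptag = trimws(tolower(grouptag))) %>%
  mutate(grouptag = recode(grouptag, 'luftdaten.info' = 'luftdaten'))
```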
```{r grouptag_counts, message=FALSE}
grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members
  filter(n() >= 8 & !is.na(grouptag)) %>%
  mutate(count = row_number(createdAt))

# helper for sorting the grouptags by box count
sortLvls = function(oldFactor, ascending = TRUE) {
  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
  factor(oldFactor, levels = lvls)
}
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)

ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
  geom_line(aes(group = grouptag)) +
  xlab('Registration Date') + ylab('senseBox count')
```
```{r grouptag_summary}
grouptag_counts %>%
  summarise(
    oldest = min(createdAt),
    newest = max(createdAt),
    count = max(count)
  ) %>%
  arrange(desc(count))
```
# Plot rate of growth and inactivity per week
First we group the boxes by `createdAt` into bins of one week:
```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
bins = 'week'
mvavg_bins = 6
growth = boxes %>%
  mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'registered')
```
We can do the same for `updatedAt`, which informs us about the last change to
a box, including uploaded measurements.
This method of determining inactive boxes is fairly inaccurate and should be
considered an approximation, because we have no information about intermediate
inactive phases.
Also, deleted boxes would probably have a big impact here.
```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
inactive = boxes %>%
  # remove boxes that were updated in the last two days,
  # b/c any box becomes inactive at some point by definition of updatedAt
  filter(updatedAt < now() - days(2)) %>%
  mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'inactive')
```
Now we can combine both datasets for plotting:
```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)

ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks = 'years', date_labels = '%Y') +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  # moving average, make first and last value NA (to ensure identical length of vectors)
  geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
```
We see a sudden rise in early 2017, which lines up with the fast-growing grouptag `Luftdaten`.
This was enabled by an integration of openSenseMap.org into the firmware of the
air quality monitoring project [luftdaten.info](https://luftdaten.info).
The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues
of the senseBox hardware, but I have no data on the exact time frames to verify this.
# Plot duration of boxes being active {.tabset}
While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity
of each box, and look at metrics by exposure and grouptag once more:
## ...by exposure
```{r exposure_duration, message=FALSE}
duration = boxes %>%
  group_by(exposure) %>%
  filter(!is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units = 'days'))

ggplot(duration, aes(x = exposure, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')
```
The time of activity averages at only `r round(mean(duration$duration))` days,
though there are boxes with `r round(max(duration$duration))` days of activity,
spanning a large chunk of openSenseMap's existence.
## ...by grouptag
```{r grouptag_duration, message=FALSE}
duration = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members
  filter(n() >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units = 'days'))

ggplot(duration, aes(x = grouptag, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')

duration %>%
  summarize(
    duration_avg = round(mean(duration)),
    duration_min = round(min(duration)),
    duration_max = round(max(duration)),
    oldest_box = round(max(difftime(now(), createdAt, units = 'days')))
  ) %>%
  arrange(desc(duration_avg))
```
The time of activity averages at only `r round(mean(duration$duration))` days,
though there are boxes with `r round(max(duration$duration))` days of activity,
spanning a large chunk of openSenseMap's existence.
## ...by year of registration
This is less useful, as older boxes are active for a longer time by definition.
If you have an idea how to compensate for that, please send a [Pull Request][PR]!
```{r year_duration, message=FALSE}
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
duration = boxes %>%
  mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
  group_by(year) %>%
  filter(!is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units = 'days'))

ggplot(duration, aes(x = substr(as.character(year), 1, 4), y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
```
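One way to compensate, sketched here without evaluation, is to normalise each box's
active duration by its total age, plotting the fraction of its lifetime each box
has been active:

```{r year_duration_ratio, eval=FALSE}
# hypothetical sketch: fraction of each box's lifetime spent active
ratio = boxes %>%
  filter(!is.na(updatedAt)) %>%
  mutate(
    year = cut(as.Date(createdAt), breaks = 'year'),
    age = as.numeric(difftime(now(), createdAt, units = 'days')),
    duration = as.numeric(difftime(updatedAt, createdAt, units = 'days')),
    active_ratio = duration / age
  )

ggplot(ratio, aes(x = substr(as.character(year), 1, 4), y = active_ratio)) +
  geom_boxplot() +
  coord_flip() + xlab('Year of Registration') + ylab('Fraction of lifetime active')
```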
# More Visualisations
Other visualisations come to mind, and are left as an exercise to the reader.
If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
* growth by phenomenon
* growth by location -> (interactive) map
* set inactive rate in relation to total box count (a rough sketch follows after this list)
* filter timespans with big dips in growth rate, and extrapolate the amount of
senseBoxes that could be on the platform today, assuming there were no production issues ;)
[PR]: https://github.com/sensebox/opensensmapr/pulls
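For the third item, here is a rough, unevaluated sketch. It reuses `boxes_by_date`
from above, pulls in `tidyr` as an extra dependency, and assumes the cumulative
registration count approximates the total number of boxes per week:

```{r inactive_rate, eval=FALSE}
# hypothetical sketch: weekly inactive events relative to all boxes registered so far
library(tidyr)

rate = boxes_by_date %>%
  ungroup() %>%
  pivot_wider(names_from = event, values_from = count, values_fill = 0) %>%
  arrange(as.Date(week)) %>%
  mutate(total = cumsum(registered), inactive_rate = inactive / total)

ggplot(rate, aes(x = as.Date(week), y = inactive_rate)) +
  geom_line() +
  xlab('Time') + ylab('inactive rate relative to total box count')
```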

File diff suppressed because one or more lines are too long

@ -1,302 +0,0 @@
---
title: "Visualising the Develpment of openSenseMap.org in 2022"
author: "Jan Stenkamp"
date: '`r Sys.Date()`'
output:
  html_document:
    code_folding: hide
    df_print: kable
    theme: lumen
    toc: yes
    toc_float: yes
  rmarkdown::html_vignette:
    df_print: kable
    fig_height: 5
    fig_width: 7
    toc: yes
vignette: >
  %\VignetteIndexEntry{Visualising the Development of openSenseMap.org in 2022}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
---
> This vignette serves as an example of data wrangling & visualization with
> `opensensmapr`, `dplyr` and `ggplot2`.
```{r setup, results='hide', message=FALSE, warning=FALSE}
# required packages:
library(opensensmapr) # data download
library(dplyr) # data wrangling
library(ggplot2) # plotting
library(lubridate) # date arithmetic
library(zoo) # rollmean()
```
openSenseMap.org has grown quite a bit in the last years; it would be interesting
to see how we got to the current `r osem_counts()$boxes` sensor stations,
split up by various attributes of the boxes.
While `opensensmapr` provides extensive methods of filtering boxes by attributes
on the server, we do the filtering within R to save time and gain flexibility.
So the first step is to retrieve *all the boxes*.
```{r download, results='hide', message=FALSE, warning=FALSE}
# if you want to see results for a specific subset of boxes,
# just specify a filter such as grouptag='ifgi' here
boxes_all = osem_boxes()
boxes = boxes_all
```
# Introduction
In the following we only want to look at the boxes created in 2022, so we filter for them.
```{r}
boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp <= "2022-12-31")
# store the summary without printing it; see the commented-out overview below
boxes_summary = summary(boxes)
```
<!-- This gives a good overview already: As of writing this, there are more than 11,000 -->
<!-- sensor stations, of which ~30% are currently running. Most of them are placed -->
<!-- outdoors and have around 5 sensors each. -->
<!-- The oldest station is from August 2016, while the latest station was registered a -->
<!-- couple of minutes ago. -->
Another feature of interest is the spatial distribution of the boxes: `plot()`
can help us out here. This function requires a bunch of optional dependencies though.
```{r message=F, warning=F}
if (!require('maps')) install.packages('maps')
if (!require('maptools')) install.packages('maptools')
if (!require('rgeos')) install.packages('rgeos')
plot(boxes)
```
But what do these sensor stations actually measure? Let's find out.
`osem_phenomena()` gives us a named list of the counts of each observed
phenomenon for the given set of sensor stations:
```{r}
phenoms = osem_phenomena(boxes)
str(phenoms)
```
That's quite some noise there, with many phenomena being measured by a single
sensor only, or many duplicated phenomena due to slightly different spellings.
We should clean that up, but for now let's just filter out the noise and find
those phenomena with high sensor numbers:
```{r}
phenoms[phenoms > 50]
```
# Plot count of boxes by time {.tabset}
By looking at the `createdAt` attribute of each box we know the exact time a box
was registered. Because of database migration issues, however, the `createdAt` values
are mostly wrong (~80% of the boxes appear to have been created on 2022-03-30), so we
use the `timestamp` attribute of the `currentlocation` instead, which should in most
cases correspond to the creation date.
With this approach we have no information about boxes that were deleted in the
meantime, but that's okay for now.
## ...and exposure
```{r exposure_counts, message=FALSE}
exposure_counts = boxes %>%
  group_by(exposure) %>%
  mutate(count = row_number(locationtimestamp))

exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')

ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
  geom_line() +
  scale_colour_manual(values = exposure_colors) +
  xlab('Registration Date') + ylab('senseBox count')
```
Outdoor boxes are growing *fast*!
Let's have a quick summary:
```{r exposure_summary}
exposure_counts %>%
  summarise(
    oldest = min(locationtimestamp),
    newest = max(locationtimestamp),
    count = max(count)
  ) %>%
  arrange(desc(count))
```
## ...and grouptag
We can try to find out where the increases in growth came from by analysing the
box count by grouptag.

Caveats: only a small subset of boxes has a grouptag, so we should assume that
these groups are actually bigger. Also, grouptag naming is inconsistent
(`Luftdaten`, `luftdaten.info`, ...).
```{r grouptag_counts, message=FALSE}
grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 15 or more members
  filter(n() >= 15 & !is.na(grouptag) & grouptag != '') %>%
  mutate(count = row_number(locationtimestamp))

# helper for sorting the grouptags by box count
sortLvls = function(oldFactor, ascending = TRUE) {
  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
  factor(oldFactor, levels = lvls)
}
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)

ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
  geom_line(aes(group = grouptag)) +
  xlab('Registration Date') + ylab('senseBox count')
```
```{r grouptag_summary}
grouptag_counts %>%
  summarise(
    oldest = min(locationtimestamp),
    newest = max(locationtimestamp),
    count = max(count)
  ) %>%
  arrange(desc(count))
```
# Plot rate of growth and inactivity per week
First we group the boxes by `locationtimestamp` into bins of one week:
```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
bins = 'week'
mvavg_bins = 6
growth = boxes %>%
  mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'registered')
```
We can do the same for `updatedAt`, which informs us about the last change to
a box, including uploaded measurements. As a lot of boxes were "updated" by the database
migration, many share an `updatedAt` of 2022-03-30, so we use the `lastMeasurement`
attribute instead. This yields fewer boxes, but automatically excludes
boxes which were created but never submitted a measurement.

This method of determining inactive boxes is fairly inaccurate and should be
considered an approximation, because we have no information about intermediate
inactive phases. Deleted boxes would probably have a big impact here as well.
```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
inactive = boxes %>%
  # remove boxes that were updated in the last two days,
  # b/c any box becomes inactive at some point by definition of lastMeasurement
  filter(lastMeasurement < now() - days(2)) %>%
  mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
  filter(as.Date(week) > as.Date("2021-12-31")) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'inactive')
```
Now we can combine both datasets for plotting:
```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)

ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks = 'years', date_labels = '%Y') +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  # moving average, make first and last value NA (to ensure identical length of vectors)
  geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
```
And see in which weeks the most boxes become (in)active:
```{r table_mostregistrations}
boxes_by_date %>%
  filter(count > 50) %>%
  arrange(desc(count))
```
# Plot duration of boxes being active {.tabset}
While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity
of each box, and look at metrics by exposure and grouptag once more:
## ...by exposure
```{r exposure_duration, message=FALSE}
durations = boxes %>%
  group_by(exposure) %>%
  filter(!is.na(lastMeasurement)) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units = 'days')) %>%
  filter(duration >= 0)

ggplot(durations, aes(x = exposure, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')
```
The time of activity averages at only `r round(mean(durations$duration))` days,
though there are boxes with `r round(max(durations$duration))` days of activity,
spanning a large chunk of openSenseMap's existence.
## ...by grouptag
```{r grouptag_duration, message=FALSE}
durations = boxes %>%
  filter(!is.na(lastMeasurement)) %>%
  group_by(grouptag) %>%
  # only include grouptags with 15 or more members
  filter(n() >= 15 & !is.na(grouptag)) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units = 'days')) %>%
  filter(duration >= 0)

ggplot(durations, aes(x = grouptag, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')

durations %>%
  summarize(
    duration_avg = round(mean(duration)),
    duration_min = round(min(duration)),
    duration_max = round(max(duration)),
    oldest_box = round(max(difftime(now(), locationtimestamp, units = 'days')))
  ) %>%
  arrange(desc(duration_avg))
```
The time of activity averages at only `r round(mean(durations$duration))` days,
though there are boxes with `r round(max(durations$duration))` days of activity,
spanning a large chunk of openSenseMap's existence.
## ...by year of registration
This is less useful, as older boxes are active for a longer time by definition.
If you have an idea how to compensate for that, please send a [Pull Request][PR]!
```{r year_duration, message=FALSE}
# NOTE: boxes older than 2016 are missing due to missing lastMeasurement in the database
duration = boxes %>%
  mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
  group_by(year) %>%
  filter(!is.na(lastMeasurement)) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units = 'days')) %>%
  filter(duration >= 0)

ggplot(duration, aes(x = substr(as.character(year), 1, 4), y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
```
# More Visualisations
Other visualisations come to mind, and are left as an exercise to the reader.
If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
* growth by phenomenon
* growth by location -> (interactive) map
* set inactive rate in relation to total box count
* filter timespans with big dips in growth rate, and extrapolate the amount of
senseBoxes that could be on the platform today, assuming there were no production issues ;)
[PR]: https://github.com/sensebox/opensensmapr/pulls

File diff suppressed because one or more lines are too long

@ -1,73 +0,0 @@
## ----setup, include=FALSE------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
## ----results = F---------------------------------------------------------
library(magrittr)
library(opensensmapr)
all_sensors = osem_boxes()
## ------------------------------------------------------------------------
summary(all_sensors)
## ----message=F, warning=F------------------------------------------------
if (!require('maps')) install.packages('maps')
if (!require('maptools')) install.packages('maptools')
if (!require('rgeos')) install.packages('rgeos')
plot(all_sensors)
## ------------------------------------------------------------------------
phenoms = osem_phenomena(all_sensors)
str(phenoms)
## ------------------------------------------------------------------------
phenoms[phenoms > 20]
## ----results = F---------------------------------------------------------
pm25_sensors = osem_boxes(
  exposure = 'outdoor',
  date = Sys.time(), # ±4 hours
  phenomenon = 'PM2.5'
)
## ------------------------------------------------------------------------
summary(pm25_sensors)
plot(pm25_sensors)
## ------------------------------------------------------------------------
library(sf)
library(units)
library(lubridate)
library(dplyr)
# construct a bounding box: 12 kilometers around Berlin
berlin = st_point(c(13.4034, 52.5120)) %>%
  st_sfc(crs = 4326) %>%
  st_transform(3857) %>% # allow setting a buffer in meters
  st_buffer(set_units(12, km)) %>%
  st_transform(4326) %>% # the openSenseMap API expects WGS 84
  st_bbox()
## ----results = F---------------------------------------------------------
pm25 = osem_measurements(
  berlin,
  phenomenon = 'PM2.5',
  from = now() - days(20), # defaults to 2 days
  to = now()
)
plot(pm25)
## ------------------------------------------------------------------------
outliers = filter(pm25, value > 100)$sensorId
bad_sensors = outliers[, drop = T] %>% levels()
pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
## ------------------------------------------------------------------------
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
## ------------------------------------------------------------------------
pm25 %>% filter(invalid == FALSE) %>% plot()

@ -1,151 +0,0 @@
---
title: "Exploring the openSenseMap Dataset"
author: "Norwin Roosen"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_vignette:
    fig_margin: 0
    fig_width: 6
    fig_height: 4
vignette: >
  %\VignetteIndexEntry{Exploring the openSenseMap Dataset}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
This package provides data ingestion functions for almost any data stored on the
open data platform for environmental sensor data, <https://opensensemap.org>.
Its main goals are to provide means for:
- big data analysis of the measurements stored on the platform
- sensor metadata analysis (sensor counts, spatial distribution, temporal trends)
### Exploring the dataset
Before we look at actual observations, let's get a grasp of the openSenseMap
dataset's structure.
```{r results = F}
library(magrittr)
library(opensensmapr)
all_sensors = osem_boxes()
```
```{r}
summary(all_sensors)
```
This gives a good overview already: As of writing this, there are more than 700
sensor stations, of which ~50% are currently running. Most of them are placed
outdoors and have around 5 sensors each.
The oldest station is from May 2014, while the latest station was registered a
couple of minutes ago.
Another feature of interest is the spatial distribution of the boxes: `plot()`
can help us out here. This function requires a bunch of optional dependencies though.
```{r message=F, warning=F}
if (!require('maps')) install.packages('maps')
if (!require('maptools')) install.packages('maptools')
if (!require('rgeos')) install.packages('rgeos')
plot(all_sensors)
```
It seems we have to reduce our area of interest to Germany.
But what do these sensor stations actually measure? Let's find out.
`osem_phenomena()` gives us a named list of the counts of each observed
phenomenon for the given set of sensor stations:
```{r}
phenoms = osem_phenomena(all_sensors)
str(phenoms)
```
That's quite some noise there, with many phenomena being measured by a single
sensor only, or many duplicated phenomena due to slightly different spellings.
We should clean that up, but for now let's just filter out the noise and find
those phenomena with high sensor numbers:
```{r}
phenoms[phenoms > 20]
```
Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more
interesting to analyze though.
We should check how many sensor stations provide useful data: we want only those
boxes with a PM2.5 sensor that are placed outdoors and are currently submitting
measurements:
```{r results = F}
pm25_sensors = osem_boxes(
  exposure = 'outdoor',
  date = Sys.time(), # ±4 hours
  phenomenon = 'PM2.5'
)
```
```{r}
summary(pm25_sensors)
plot(pm25_sensors)
```
That's still more than 200 measuring stations; we can work with that.
### Analyzing sensor data
Having analyzed the available data sources, let's finally get some measurements.
We could call `osem_measurements(pm25_sensors)` now; however, we are focusing on
a restricted area of interest: the city of Berlin.
Luckily we can get the measurements filtered by a bounding box:
```{r}
library(sf)
library(units)
library(lubridate)
library(dplyr)
# construct a bounding box: 12 kilometers around Berlin
berlin = st_point(c(13.4034, 52.5120)) %>%
  st_sfc(crs = 4326) %>%
  st_transform(3857) %>% # allow setting a buffer in meters
  st_buffer(set_units(12, km)) %>%
  st_transform(4326) %>% # the openSenseMap API expects WGS 84
  st_bbox()
```
```{r results = F}
pm25 = osem_measurements(
  berlin,
  phenomenon = 'PM2.5',
  from = now() - days(20), # defaults to 2 days
  to = now()
)
plot(pm25)
```
Now we can get started with actual spatiotemporal data analysis.
First, let's mask the seemingly uncalibrated sensors:
```{r}
outliers = filter(pm25, value > 100)$sensorId
bad_sensors = outliers[, drop = T] %>% levels()
pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
```
Then plot the measuring locations, flagging the outliers:
```{r}
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
```
Removing these sensors yields a nicer time series plot:
```{r}
pm25 %>% filter(invalid == FALSE) %>% plot()
```
Further analysis: comparison with LANUV data `TODO`

File diff suppressed because one or more lines are too long

@ -1,51 +0,0 @@
## ----setup, results='hide'-----------------------------------------------
# this vignette requires:
library(opensensmapr)
library(jsonlite)
library(readr)
## ----cache---------------------------------------------------------------
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
# the next identical request will hit the cache only!
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
# requests without the cache parameter will still be performed normally
b = osem_boxes(grouptag = 'ifgi')
## ----cachelisting--------------------------------------------------------
list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
## ----cache_custom--------------------------------------------------------
cacheDir = getwd() # current working directory
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
# the next identical request will hit the cache only!
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
## ----clearcache----------------------------------------------------------
osem_clear_cache() # clears default cache
osem_clear_cache(getwd()) # clears a custom cache
## ----data, results='hide'------------------------------------------------
# first get our example data:
measurements = osem_measurements('Windrichtung')
## ----serialize_json------------------------------------------------------
# serializing senseBoxes to JSON, and loading from file again:
write(jsonlite::serializeJSON(measurements), 'measurements.json')
measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
class(measurements_from_file)
## ----serialize_attrs-----------------------------------------------------
# note the toJSON call instead of serializeJSON
write(jsonlite::toJSON(measurements), 'measurements_bad.json')
measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
class(measurements_without_attrs)
measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
class(measurements_with_attrs)
## ----cleanup, include=FALSE----------------------------------------------
file.remove('measurements.json', 'measurements_bad.json')

@ -1,106 +0,0 @@
---
title: "Caching openSenseMap Data for Reproducibility"
author: "Norwin Roosen"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
It may be useful to download data from openSenseMap only once.
For reproducible results, the data should be saved to disk, and reloaded at a
later point.
This avoids:
- changed results for queries without date parameters,
- unnecessary wait times,
- risk of API changes / API unavailability,
- stress on the openSenseMap-server.
This vignette shows how to use this built-in `opensensmapr` feature, and
how to do it yourself in case you want to save to other data formats.
```{r setup, results='hide'}
# this vignette requires:
library(opensensmapr)
library(jsonlite)
library(readr)
```
## Using the opensensmapr Caching Feature
All data retrieval functions of `opensensmapr` have a built-in caching feature,
which serializes an API response to disk.
Subsequent identical requests will then return the serialized data instead of making
another request.
To use this feature, just add a path to a directory to the `cache` parameter:
```{r cache}
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
# the next identical request will hit the cache only!
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
# requests without the cache parameter will still be performed normally
b = osem_boxes(grouptag = 'ifgi')
```
Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL:
```{r cachelisting}
list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
```
You can maintain multiple caches simultaneously, which allows you to keep the data for each script in its own directory:
```{r cache_custom}
cacheDir = getwd() # current working directory
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
# the next identical request will hit the cache only!
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
```
To get fresh results again, just call `osem_clear_cache()` for the respective cache:
```{r clearcache, results='hide'}
osem_clear_cache() # clears default cache
osem_clear_cache(getwd()) # clears a custom cache
```
## Custom (De-) Serialization
If you want to roll your own serialization method to support custom data formats,
here's how:
```{r data, results='hide'}
# first get our example data:
measurements = osem_measurements('Windrichtung')
```
If you are paranoid and worry about `.rds` files not being decodable anymore
in the (distant) future, you could serialize to a plain text format such as JSON.
This of course comes at the cost of storage space and performance.
```{r serialize_json}
# serializing senseBoxes to JSON, and loading from file again:
write(jsonlite::serializeJSON(measurements), 'measurements.json')
measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
class(measurements_from_file)
```
This method also persists the R object metadata (classes, attributes).
If you were to use a serialization method that can't persist object metadata, you
could re-apply it with the following functions:
```{r serialize_attrs}
# note the toJSON call instead of serializeJSON
write(jsonlite::toJSON(measurements), 'measurements_bad.json')
measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
class(measurements_without_attrs)
measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
class(measurements_with_attrs)
```
The same goes for boxes via `osem_as_sensebox()`.
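A corresponding sketch for boxes, not evaluated here, using a small `ifgi` query as
example data:

```{r serialize_boxes, eval=FALSE}
# same pattern for boxes: plain JSON drops the sensebox class,
# osem_as_sensebox() re-applies it after loading
b = osem_boxes(grouptag = 'ifgi')
write(jsonlite::toJSON(b), 'boxes_bad.json')
b_restored = osem_as_sensebox(jsonlite::fromJSON('boxes_bad.json'))
class(b_restored)
file.remove('boxes_bad.json')
```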
```{r cleanup, include=FALSE}
file.remove('measurements.json', 'measurements_bad.json')
```

File diff suppressed because one or more lines are too long