remove inst/doc files

2025-06-07 17:06:10 +02:00 · 2023-02-23 14:07:21 +01:00 · 2023-02-23 14:07:21 +01:00 · 62667ef139
commit 62667ef139
parent b26ca150a9
11 changed files with 0 additions and 4567 deletions
--- a/inst/doc/osem-history.R
+++ b/inst/doc/osem-history.R
@ -1,133 +0,0 @@
 ## ----setup, results='hide', message=FALSE, warning=FALSE-----------------
 # required packages:
 library(opensensmapr) # data download
 library(dplyr)        # data wrangling
 library(ggplot2)      # plotting
 library(lubridate)    # date arithmetic
 library(zoo)          # rollmean()
 ## ----download------------------------------------------------------------
 # if you want to see results for a specific subset of boxes,
 # just specify a filter such as grouptag='ifgi' here
 boxes = osem_boxes()
 ## ----exposure_counts, message=FALSE--------------------------------------
 # number the boxes of every exposure type in order of registration, which
 # yields a running count per exposure
 exposure_counts <- mutate(
   group_by(boxes, exposure),
   count = row_number(createdAt)
 )
 exposure_colors <- c(
   indoor = 'red',
   outdoor = 'lightgreen',
   mobile = 'blue',
   unknown = 'darkgrey'
 )
 ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
   geom_line() +
   scale_colour_manual(values = exposure_colors) +
   labs(x = 'Registration Date', y = 'senseBox count')
 ## ----exposure_summary----------------------------------------------------
 # first / latest registration and total box count per exposure, largest first
 exposure_summary_tbl <- function(counts) {
   per_exposure <- summarise(
     counts,
     oldest = min(createdAt),
     newest = max(createdAt),
     count = max(count)
   )
   arrange(per_exposure, desc(count))
 }
 exposure_summary_tbl(exposure_counts)
 ## ----grouptag_counts, message=FALSE--------------------------------------
 grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members.
  # `&` instead of `&&`: the NA test yields one value per row of the group,
  # and `&&` raises an error for operands longer than 1 since R 4.3
  filter(n() >= 8 & !is.na(grouptag)) %>%
  # running box count within each grouptag, ordered by registration time
  mutate(count = row_number(createdAt))
 # helper for sorting the grouptags by boxcount:
 # returns `oldFactor` as a factor whose levels are ordered by how often each
 # value occurs (least frequent first unless `ascending = FALSE`)
 sortLvls = function(oldFactor, ascending = TRUE) {
  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
  factor(oldFactor, levels = lvls)
 }
 grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
 ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
  geom_line(aes(group = grouptag)) +
  xlab('Registration Date') + ylab('senseBox count')
 ## ----grouptag_summary----------------------------------------------------
 # per grouptag: first and latest registration, and total number of boxes
 grouptag_counts %>%
  summarise(
    oldest = min(createdAt),
    newest = max(createdAt),
    count = max(count)
  ) %>%
  arrange(desc(count))
 ## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'----
 bins <- 'week'
 mvavg_bins <- 6
 # registrations per bin: bucket createdAt by `bins`, then count rows per bucket
 growth <- boxes %>%
   mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
   group_by(week) %>%
   summarize(count = n()) %>%
   mutate(event = 'registered')
 ## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'----
 # last update per bin; boxes updated within the last two days are left out,
 # since their updatedAt does not mark the end of their activity yet
 inactive <- boxes %>%
   filter(updatedAt < now() - days(2)) %>%
   mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
   group_by(week) %>%
   summarize(count = n()) %>%
   mutate(event = 'inactive')
 ## ----growthrate, warning=FALSE, message=FALSE, results='hide'------------
 boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
 boxes_by_date %>%
  # moving average, computed per event type (the data is grouped by event).
  # Calling rollmean() inside aes() would run the window over the stacked rows
  # of both events and mix them at the boundary. The first and last values of
  # every event are filled with NA so the result keeps the length of `count`.
  mutate(mvavg = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))) %>%
  ggplot(aes(x = as.Date(week), colour = event)) +
  # paste() already inserts a separating space
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks="years", date_labels="%Y") +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  geom_line(aes(y = mvavg))
 ## ----exposure_duration, message=FALSE------------------------------------
 # time between registration and last update for every box that has an
 # updatedAt value, kept grouped by exposure
 duration <- mutate(
   filter(group_by(boxes, exposure), !is.na(updatedAt)),
   duration = difftime(updatedAt, createdAt, units = 'days')
 )
 ggplot(duration, aes(x = exposure, y = duration)) +
   geom_boxplot() +
   coord_flip() +
   labs(y = 'Duration active in Days')
 ## ----grouptag_duration, message=FALSE------------------------------------
 duration = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members, and only boxes that have
  # an updatedAt value.
  # `&` instead of `&&`: the NA tests yield one value per row, and `&&` raises
  # an error for operands longer than 1 since R 4.3 (earlier versions silently
  # tested only the first row of each group)
  filter(n() >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units='days'))
 ggplot(duration, aes(x = grouptag, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')
 # per grouptag: mean / min / max activity duration and age of the oldest box
 duration %>%
  summarize(
    duration_avg = round(mean(duration)),
    duration_min = round(min(duration)),
    duration_max = round(max(duration)),
    oldest_box = round(max(difftime(now(), createdAt, units='days')))
  ) %>%
  arrange(desc(duration_avg))
 ## ----year_duration, message=FALSE----------------------------------------
 # NOTE: boxes older than 2016 missing due to missing updatedAt in database
 # bucket boxes by registration year, then measure registration -> last update
 duration <- boxes %>%
   mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
   group_by(year) %>%
   filter(!is.na(updatedAt)) %>%
   mutate(duration = difftime(updatedAt, createdAt, units = 'days'))
 # the bucket labels are dates; their first four characters are the year
 ggplot(duration, aes(x = substr(as.character(year), 1, 4), y = duration)) +
   geom_boxplot() +
   coord_flip() +
   labs(x = 'Year of Registration', y = 'Duration active in Days')
--- a/inst/doc/osem-history.Rmd
+++ b/inst/doc/osem-history.Rmd
@ -1,243 +0,0 @@
 ---
 title: "Visualising the History of openSenseMap.org"
 author: "Norwin Roosen"
 date: '`r Sys.Date()`'
 output:
  rmarkdown::html_vignette:
    df_print: kable
    fig_height: 5
    fig_width: 7
    toc: yes
  html_document:
    code_folding: hide
    df_print: kable
    theme: lumen
    toc: yes
    toc_float: yes
 vignette: >
  %\VignetteIndexEntry{Visualising the History of openSenseMap.org}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
 ---
 > This vignette serves as an example on data wrangling & visualization with
 `opensensmapr`, `dplyr` and `ggplot2`.
 ```{r setup, results='hide', message=FALSE, warning=FALSE}
 # required packages:
 library(opensensmapr) # data download
 library(dplyr)        # data wrangling
 library(ggplot2)      # plotting
 library(lubridate)    # date arithmetic
 library(zoo)          # rollmean()
 ```
 openSenseMap.org has grown quite a bit in the last years; it would be interesting
 to see how we got to the current `r osem_counts()$boxes` sensor stations,
 split up by various attributes of the boxes.
 While `opensensmapr` provides extensive methods of filtering boxes by attributes
 on the server, we do the filtering within R to save time and gain flexibility.
 So the first step is to retrieve *all the boxes*:
 ```{r download}
 # if you want to see results for a specific subset of boxes,
 # just specify a filter such as grouptag='ifgi' here
 boxes = osem_boxes()
 ```
 # Plot count of boxes by time {.tabset}
 By looking at the `createdAt` attribute of each box we know the exact time a box
 was registered.
 With this approach we have no information about boxes that were deleted in the
 meantime, but that's okay for now.
 ## ...and exposure
 ```{r exposure_counts, message=FALSE}
 exposure_counts = boxes %>%
  group_by(exposure) %>%
  mutate(count = row_number(createdAt))
 exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
 ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
  geom_line() +
  scale_colour_manual(values = exposure_colors) +
  xlab('Registration Date') + ylab('senseBox count')
 ```
 Outdoor boxes are growing *fast*!
 We can also see the introduction of `mobile` sensor "stations" in 2017. While
 mobile boxes are still few, we can expect a quick rise in 2018 once the new
 [senseBox MCU with GPS support is released](https://sensebox.de/blog/2018-03-06-senseBox_MCU).
 Let's have a quick summary:
 ```{r exposure_summary}
 exposure_counts %>%
  summarise(
    oldest = min(createdAt),
    newest = max(createdAt),
    count = max(count)
  ) %>%
  arrange(desc(count))
 ```
 ## ...and grouptag
 We can try to find out where the increases in growth came from, by analysing the 
 box count by grouptag.
 Caveats: Only a small subset of boxes has a grouptag, and we should assume
 that these groups are actually bigger. Also, we can see that grouptag naming is
 inconsistent (`Luftdaten`, `luftdaten.info`, ...)
 ```{r grouptag_counts, message=FALSE}
 grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members.
  # `&` instead of `&&`: the NA test yields one value per row of the group,
  # and `&&` raises an error for operands longer than 1 since R 4.3
  filter(n() >= 8 & !is.na(grouptag)) %>%
  # running box count within each grouptag, ordered by registration time
  mutate(count = row_number(createdAt))
 # helper for sorting the grouptags by boxcount:
 # returns `oldFactor` as a factor whose levels are ordered by how often each
 # value occurs (least frequent first unless `ascending = FALSE`)
 sortLvls = function(oldFactor, ascending = TRUE) {
  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
  factor(oldFactor, levels = lvls)
 }
 grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
 ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
  geom_line(aes(group = grouptag)) +
  xlab('Registration Date') + ylab('senseBox count')
 ```
 ```{r grouptag_summary}
 grouptag_counts %>%
  summarise(
    oldest = min(createdAt),
    newest = max(createdAt),
    count = max(count)
  ) %>%
  arrange(desc(count))
 ```
 # Plot rate of growth and inactivity per week
 First we group the boxes by `createdAt` into bins of one week:
 ```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
 bins = 'week'
 mvavg_bins = 6
 growth = boxes %>%
  mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'registered')
 ```
 We can do the same for `updatedAt`, which informs us about the last change to
 a box, including uploaded measurements.
 This method of determining inactive boxes is fairly inaccurate and should be
 considered an approximation, because we have no information about intermediate
 inactive phases.
 Also deleted boxes would probably have a big impact here.
 ```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
 inactive = boxes %>%
  # remove boxes that were updated in the last two days,
  # b/c any box becomes inactive at some point by definition of updatedAt
  filter(updatedAt < now() - days(2)) %>%
  mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'inactive')
 ```
 Now we can combine both datasets for plotting:
 ```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
 boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
 boxes_by_date %>%
  # moving average, computed per event type (the data is grouped by event).
  # Calling rollmean() inside aes() would run the window over the stacked rows
  # of both events and mix them at the boundary. The first and last values of
  # every event are filled with NA so the result keeps the length of `count`.
  mutate(mvavg = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))) %>%
  ggplot(aes(x = as.Date(week), colour = event)) +
  # paste() already inserts a separating space
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks="years", date_labels="%Y") +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  geom_line(aes(y = mvavg))
 ```
 We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`.
 This was enabled by an integration of openSenseMap.org into the firmware of the
 air quality monitoring project [luftdaten.info](https://luftdaten.info).
 The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues
 of the senseBox hardware, but I have no data on the exact time frames to verify.
 # Plot duration of boxes being active {.tabset}
 While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity
 of each box, and look at metrics by exposure and grouptag once more:
 ## ...by exposure
 ```{r exposure_duration, message=FALSE}
 duration = boxes %>%
  group_by(exposure) %>%
  filter(!is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units='days'))
 ggplot(duration, aes(x = exposure, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')
 ```
 The time of activity averages at only `r round(mean(duration$duration))` days,
 though there are boxes with `r round(max(duration$duration))` days of activity,
 spanning a large chunk of openSenseMap's existence.
 ## ...by grouptag
 ```{r grouptag_duration, message=FALSE}
 duration = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members, and only boxes that have
  # an updatedAt value.
  # `&` instead of `&&`: the NA tests yield one value per row, and `&&` raises
  # an error for operands longer than 1 since R 4.3 (earlier versions silently
  # tested only the first row of each group)
  filter(n() >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units='days'))
 ggplot(duration, aes(x = grouptag, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')
 # per grouptag: mean / min / max activity duration and age of the oldest box
 duration %>%
  summarize(
    duration_avg = round(mean(duration)),
    duration_min = round(min(duration)),
    duration_max = round(max(duration)),
    oldest_box = round(max(difftime(now(), createdAt, units='days')))
  ) %>%
  arrange(desc(duration_avg))
 ```
 The time of activity averages at only `r round(mean(duration$duration))` days,
 though there are boxes with `r round(max(duration$duration))` days of activity,
 spanning a large chunk of openSenseMap's existence.
 ## ...by year of registration
 This is less useful, as older boxes are active for a longer time by definition.
 If you have an idea how to compensate for that, please send a [Pull Request][PR]!
 ```{r year_duration, message=FALSE}
 # NOTE: boxes older than 2016 missing due to missing updatedAt in database
 duration = boxes %>%
  mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
  group_by(year) %>%
  filter(!is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units='days'))
 ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
 ```
 # More Visualisations
 Other visualisations come to mind, and are left as an exercise to the reader.
 If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
 * growth by phenomenon
 * growth by location -> (interactive) map
 * set inactive rate in relation to total box count
 * filter timespans with big dips in growth rate, and extrapolate the amount of
  senseBoxes that could be on the platform today, assuming there were no production issues ;)
 [PR]: https://github.com/sensebox/opensensmapr/pulls
--- a/inst/doc/osem-history.html
+++ b/inst/doc/osem-history.html
--- a/inst/doc/osem-history_revised.Rmd
+++ b/inst/doc/osem-history_revised.Rmd
@ -1,302 +0,0 @@
 ---
 title: "Visualising the Development of openSenseMap.org in 2022"
 author: "Jan Stenkamp"
 date: '`r Sys.Date()`'
 output:
  html_document:
    code_folding: hide
    df_print: kable
    theme: lumen
    toc: yes
    toc_float: yes
  rmarkdown::html_vignette:
    df_print: kable
    fig_height: 5
    fig_width: 7
    toc: yes
 vignette: >
  %\VignetteIndexEntry{Visualising the History of openSenseMap.org}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
 ---
 > This vignette serves as an example on data wrangling & visualization with
 `opensensmapr`, `dplyr` and `ggplot2`.
 ```{r setup, results='hide', message=FALSE, warning=FALSE}
 # required packages:
 # library(opensensmapr) # data download
 library(devtools)
 load_all(".")
 library(dplyr)        # data wrangling
 library(ggplot2)      # plotting
 library(lubridate)    # date arithmetic
 library(zoo)          # rollmean()
 ```
 openSenseMap.org has grown quite a bit in the last years; it would be interesting
 to see how we got to the current `r osem_counts()$boxes` sensor stations,
 split up by various attributes of the boxes.
 While `opensensmapr` provides extensive methods of filtering boxes by attributes
 on the server, we do the filtering within R to save time and gain flexibility.
 So the first step is to retrieve *all the boxes*. 
 ```{r download, results='hide', message=FALSE, warning=FALSE}
 # if you want to see results for a specific subset of boxes,
 # just specify a filter such as grouptag='ifgi' here
 boxes_all = osem_boxes()
 boxes = boxes_all
 ```
 # Introduction
 In the following we just want to have a look at the boxes created in 2022, so we filter for them. 
 ```{r}
 # keep boxes whose location timestamp lies in 2022. The upper bound is the
 # exclusive start of 2023: comparing with `<= "2022-12-31"` would cut off at
 # the start of Dec 31 and drop boxes registered during that day.
 boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp < "2023-01-01")
 # the summary is stored rather than printed
 # NOTE(review): the variable shares its name with the base S3 method
 # summary.data.frame -- consider renaming it
 summary(boxes) -> summary.data.frame
 ```
 <!-- This gives a good overview already: As of writing this, there are more than 11,000 -->
 <!-- sensor stations, of which ~30% are currently running. Most of them are placed -->
 <!-- outdoors and have around 5 sensors each. -->
 <!-- The oldest station is from August 2016, while the latest station was registered a -->
 <!-- couple of minutes ago. -->
 Another feature of interest is the spatial distribution of the boxes: `plot()`
 can help us out here. This function requires a bunch of optional dependencies though.
 ```{r message=F, warning=F}
 if (!require('maps'))     install.packages('maps')
 if (!require('maptools')) install.packages('maptools')
 if (!require('rgeos'))    install.packages('rgeos')
 plot(boxes)
 ```
 But what do these sensor stations actually measure? Let's find out.
 `osem_phenomena()` gives us a named list of the counts of each observed
 phenomenon for the given set of sensor stations:
 ```{r}
 phenoms = osem_phenomena(boxes)
 str(phenoms)
 ```
 That's quite some noise there, with many phenomena being measured by a single
 sensor only, or many duplicated phenomena due to slightly different spellings.
 We should clean that up, but for now let's just filter out the noise and find
 those phenomena with high sensor numbers:
 ```{r}
 phenoms[phenoms > 50]
 ```
 # Plot count of boxes by time {.tabset}
 By looking at the `createdAt` attribute of each box we know the exact time a box
 was registered. Because of some database migration issues the `createdAt` values are mostly wrong (~80% of boxes created 2022-03-30), so we are using the `timestamp` attribute of the `currentlocation` which should in most cases correspond to the creation date.
 With this approach we have no information about boxes that were deleted in the
 meantime, but that's okay for now.
 ## ...and exposure
 ```{r exposure_counts, message=FALSE}
 exposure_counts = boxes %>%
  group_by(exposure) %>%
  mutate(count = row_number(locationtimestamp))
 exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
 ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
  geom_line() +
  scale_colour_manual(values = exposure_colors) +
  xlab('Registration Date') + ylab('senseBox count')
 ```
 Outdoor boxes are growing *fast*!
 We can also see the introduction of `mobile` sensor "stations" in 2017. 
 Let's have a quick summary:
 ```{r exposure_summary}
 exposure_counts %>%
  summarise(
    oldest = min(locationtimestamp),
    newest = max(locationtimestamp),
    count = max(count)
  ) %>%
  arrange(desc(count))
 ```
 ## ...and grouptag
 We can try to find out where the increases in growth came from, by analysing the 
 box count by grouptag.
 Caveats: Only a small subset of boxes has a grouptag, and we should assume
 that these groups are actually bigger. Also, we can see that grouptag naming is
 inconsistent (`Luftdaten`, `luftdaten.info`, ...)
 ```{r grouptag_counts, message=FALSE}
 grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 15 or more members, skipping missing and
  # empty tags.
  # `&` instead of `&&`: the tests on grouptag yield one value per row of the
  # group, and `&&` raises an error for operands longer than 1 since R 4.3
  filter(n() >= 15 & !is.na(grouptag) & grouptag != '') %>%
  # running box count within each grouptag, ordered by location timestamp
  mutate(count = row_number(locationtimestamp))
 # helper for sorting the grouptags by boxcount:
 # returns `oldFactor` as a factor whose levels are ordered by how often each
 # value occurs (least frequent first unless `ascending = FALSE`)
 sortLvls = function(oldFactor, ascending = TRUE) {
  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
  factor(oldFactor, levels = lvls)
 }
 grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
 ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
  geom_line(aes(group = grouptag)) +
  xlab('Registration Date') + ylab('senseBox count')
 ```
 ```{r grouptag_summary}
 grouptag_counts %>%
  summarise(
    oldest = min(locationtimestamp),
    newest = max(locationtimestamp),
    count = max(count)
  ) %>%
  arrange(desc(count))
 ```
 # Plot rate of growth and inactivity per week
 First we group the boxes by `locationtimestamp` into bins of one week:
 ```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
 bins = 'week'
 mvavg_bins = 6
 growth = boxes %>%
  mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = length(week)) %>%
  mutate(event = 'registered')
 ```
 We can do the same for `updatedAt`, which informs us about the last change to
 a box, including uploaded measurements. As a lot of boxes were "updated" by the database
 migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement` 
 attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes 
 boxes which were created but never made a measurement.
 This method of determining inactive boxes is fairly inaccurate and should be
 considered an approximation, because we have no information about intermediate
 inactive phases.
 Also deleted boxes would probably have a big impact here.
 ```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
 inactive = boxes %>%
  # remove boxes that sent a measurement in the last two days,
  # b/c their lastMeasurement does not mark the end of their activity yet
  filter(lastMeasurement < now() - days(2)) %>%
  # bucket the last measurement into bins of width `bins`
  mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
  # drop bins that start before 2022
  filter(as.Date(week) > as.Date("2021-12-31")) %>%
  group_by(week) %>%
  # number of boxes whose last measurement falls into each bin
  summarize(count = length(week)) %>%
  mutate(event = 'inactive')
 ```
 Now we can combine both datasets for plotting:
 ```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
 boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
 boxes_by_date %>%
  # moving average, computed per event type (the data is grouped by event).
  # Calling rollmean() inside aes() would run the window over the stacked rows
  # of both events and mix them at the boundary. The first and last values of
  # every event are filled with NA so the result keeps the length of `count`.
  # The column is added only for plotting; boxes_by_date itself is unchanged.
  mutate(mvavg = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))) %>%
  ggplot(aes(x = as.Date(week), colour = event)) +
  # paste() already inserts a separating space
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks="years", date_labels="%Y") +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  geom_line(aes(y = mvavg))
 ```
 And see in which weeks the most boxes become (in)active:
 ```{r table_mostregistrations}
 boxes_by_date %>%
  filter(count > 50) %>%
  arrange(desc(count))
 ```
 # Plot duration of boxes being active {.tabset}
 While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity
 of each box, and look at metrics by exposure and grouptag once more:
 ## ...by exposure
 ```{r exposure_duration, message=FALSE}
 durations = boxes %>%
  group_by(exposure) %>%
  filter(!is.na(lastMeasurement)) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
  filter(duration >= 0)
 ggplot(durations, aes(x = exposure, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')
 ```
 The time of activity averages at only `r round(mean(durations$duration))` days,
 though there are boxes with `r round(max(durations$duration))` days of activity,
 spanning a large chunk of openSenseMap's existence.
 ## ...by grouptag
 ```{r grouptag_duration, message=FALSE}
 # activity duration (location timestamp -> last measurement) per grouptag
 durations = boxes %>%
  filter(!is.na(lastMeasurement)) %>%
  group_by(grouptag) %>%
  # only include grouptags with 15 or more members
  # (the lastMeasurement test repeats the filter applied before grouping)
  filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
  # drop boxes whose last measurement precedes their location timestamp
  filter(duration >= 0)
 ggplot(durations, aes(x = grouptag, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')
 # per grouptag: mean / min / max activity duration and age of the oldest box
 durations %>%
  summarize(
    duration_avg = round(mean(duration)),
    duration_min = round(min(duration)),
    duration_max = round(max(duration)),
    oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
  ) %>%
  arrange(desc(duration_avg))
 ```
 The time of activity averages at only `r round(mean(durations$duration))` days,
 though there are boxes with `r round(max(durations$duration))` days of activity,
 spanning a large chunk of openSenseMap's existence.
 ## ...by year of registration
 This is less useful, as older boxes are active for a longer time by definition.
 If you have an idea how to compensate for that, please send a [Pull Request][PR]!
 ```{r year_duration, message=FALSE}
 # NOTE: boxes without a lastMeasurement value are left out
 # activity duration (location timestamp -> last measurement) per year
 duration = boxes %>%
  mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
  group_by(year) %>%
  filter(!is.na(lastMeasurement)) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
  # drop boxes whose last measurement precedes their location timestamp
  filter(duration >= 0)
 # the bucket labels are dates; their first four characters are the year
 ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
 ```
 # More Visualisations
 Other visualisations come to mind, and are left as an exercise to the reader.
 If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
 * growth by phenomenon
 * growth by location -> (interactive) map
 * set inactive rate in relation to total box count
 * filter timespans with big dips in growth rate, and extrapolate the amount of
  senseBoxes that could be on the platform today, assuming there were no production issues ;)
 [PR]: https://github.com/sensebox/opensensmapr/pulls
--- a/inst/doc/osem-history_revised.html
+++ b/inst/doc/osem-history_revised.html
--- a/inst/doc/osem-intro.R
+++ b/inst/doc/osem-intro.R
@ -1,73 +0,0 @@
 ## ----setup, include=FALSE------------------------------------------------
 knitr::opts_chunk$set(echo = TRUE)
 ## ----results = F---------------------------------------------------------
 library(magrittr)
 library(opensensmapr)
 all_sensors = osem_boxes()
 ## ------------------------------------------------------------------------
 summary(all_sensors)
 ## ----message=F, warning=F------------------------------------------------
 if (!require('maps'))     install.packages('maps')
 if (!require('maptools')) install.packages('maptools')
 if (!require('rgeos'))    install.packages('rgeos')
 plot(all_sensors)
 ## ------------------------------------------------------------------------
 phenoms = osem_phenomena(all_sensors)
 str(phenoms)
 ## ------------------------------------------------------------------------
 phenoms[phenoms > 20]
 ## ----results = F---------------------------------------------------------
 pm25_sensors = osem_boxes(
  exposure = 'outdoor',
  date = Sys.time(), # ±4 hours
  phenomenon = 'PM2.5'
 )
 ## ------------------------------------------------------------------------
 summary(pm25_sensors)
 plot(pm25_sensors)
 ## ------------------------------------------------------------------------
 library(sf)
 library(units)
 library(lubridate)
 library(dplyr)
 # construct a bounding box: 12 kilometers around Berlin
 berlin = st_point(c(13.4034, 52.5120)) %>%
  st_sfc(crs = 4326) %>%
  st_transform(3857) %>% # allow setting a buffer in meters
  st_buffer(set_units(12, km)) %>%
  st_transform(4326) %>% # the opensensemap expects WGS 84
  st_bbox()
 ## ----results = F---------------------------------------------------------
 pm25 = osem_measurements(
  berlin,
  phenomenon = 'PM2.5',
  from = now() - days(20), # defaults to 2 days
  to = now()
 )
 plot(pm25)
 ## ------------------------------------------------------------------------
 # sensors that reported at least one value above 100 are flagged as invalid
 outliers = filter(pm25, value > 100)$sensorId
 # droplevels() keeps only the levels that occur among the outliers
 # (spelled-out replacement for `outliers[, drop = T]`; `T` can be reassigned)
 bad_sensors = levels(droplevels(outliers))
 pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
 ## ------------------------------------------------------------------------
 # measuring locations, coloured by the invalid flag
 st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = TRUE)
 ## ------------------------------------------------------------------------
 # time series without the flagged sensors
 pm25 %>% filter(!invalid) %>% plot()
--- a/inst/doc/osem-intro.Rmd
+++ b/inst/doc/osem-intro.Rmd
@ -1,151 +0,0 @@
 ---
 title: "Exploring the openSenseMap Dataset"
 author: "Norwin Roosen"
 date: "`r Sys.Date()`"
 output:
  rmarkdown::html_vignette:
    fig_margin: 0
    fig_width: 6
    fig_height: 4
 vignette: >
  %\VignetteIndexEntry{Exploring the openSenseMap Dataset}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
 ---
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
 ```
 This package provides data ingestion functions for almost any data stored on the
 open data platform for environmental sensor data <https://opensensemap.org>.
 Its main goals are to provide means for:
 - big data analysis of the measurements stored on the platform
 - sensor metadata analysis (sensor counts, spatial distribution, temporal trends)
 ### Exploring the dataset
 Before we look at actual observations, let's get a grasp of the openSenseMap
 dataset's structure.
 ```{r results = F}
 library(magrittr)
 library(opensensmapr)
 all_sensors = osem_boxes()
 ```
 ```{r}
 summary(all_sensors)
 ```
 This gives a good overview already: As of writing this, there are more than 700
 sensor stations, of which ~50% are currently running. Most of them are placed
 outdoors and have around 5 sensors each.
 The oldest station is from May 2014, while the latest station was registered a
 couple of minutes ago.
 Another feature of interest is the spatial distribution of the boxes: `plot()`
 can help us out here. This function requires a bunch of optional dependencies though.
 ```{r message=F, warning=F}
 if (!require('maps'))     install.packages('maps')
 if (!require('maptools')) install.packages('maptools')
 if (!require('rgeos'))    install.packages('rgeos')
 plot(all_sensors)
 ```
 It seems we have to reduce our area of interest to Germany.
 But what do these sensor stations actually measure? Let's find out.
 `osem_phenomena()` gives us a named list of the counts of each observed
 phenomenon for the given set of sensor stations:
 ```{r}
 phenoms = osem_phenomena(all_sensors)
 str(phenoms)
 ```
 That's quite some noise there, with many phenomena being measured by a single
 sensor only, or many duplicated phenomena due to slightly different spellings.
 We should clean that up, but for now let's just filter out the noise and find
 those phenomena with high sensor numbers:
 ```{r}
 phenoms[phenoms > 20]
 ```
 Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more
 interesting to analyze though. 
 We should check how many sensor stations provide useful data: We want only those
 boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting
 measurements:
 ```{r results = F}
 pm25_sensors = osem_boxes(
  exposure = 'outdoor',
  date = Sys.time(), # ±4 hours
  phenomenon = 'PM2.5'
 )
 ```
 ```{r}
 summary(pm25_sensors)
 plot(pm25_sensors)
 ```
 That's still more than 200 measuring stations; we can work with that.
 ### Analyzing sensor data
 Having analyzed the available data sources, let's finally get some measurements.
 We could call `osem_measurements(pm25_sensors)` now, however we are focussing on
 a restricted area of interest, the city of Berlin.
 Luckily we can get the measurements filtered by a bounding box:
 ```{r}
 library(sf)
 library(units)
 library(lubridate)
 library(dplyr)
 # construct a bounding box: 12 kilometers around Berlin
 berlin = st_point(c(13.4034, 52.5120)) %>%
  st_sfc(crs = 4326) %>%
  st_transform(3857) %>% # allow setting a buffer in meters
  st_buffer(set_units(12, km)) %>%
  st_transform(4326) %>% # the opensensemap expects WGS 84
  st_bbox()
 ```
 ```{r results = F}
 pm25 = osem_measurements(
  berlin,
  phenomenon = 'PM2.5',
  from = now() - days(20), # defaults to 2 days
  to = now()
 )
 plot(pm25)
 ```
 Now we can get started with actual spatiotemporal data analysis.
 First, let's mask the seemingly uncalibrated sensors:
 ```{r}
 # sensors that reported at least one value above 100 are flagged as invalid
 outliers = filter(pm25, value > 100)$sensorId
 # droplevels() keeps only the levels that occur among the outliers
 # (spelled-out replacement for `outliers[, drop = T]`; `T` can be reassigned)
 bad_sensors = levels(droplevels(outliers))
 pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
 ```
 Then plot the measuring locations, flagging the outliers:
 ```{r}
 # measuring locations, coloured by the invalid flag (TRUE spelled out: `T` can be reassigned)
 st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = TRUE)
 ```
 Removing these sensors yields a nicer time series plot:
 ```{r}
 pm25 %>% filter(invalid == FALSE) %>% plot()
 ```
 Further analysis: comparison with LANUV data `TODO`
--- a/inst/doc/osem-intro.html
+++ b/inst/doc/osem-intro.html
--- a/inst/doc/osem-serialization.R
+++ b/inst/doc/osem-serialization.R
@ -1,51 +0,0 @@
 ## ----setup, results='hide'-----------------------------------------------
 # packages used throughout this vignette:
 library(opensensmapr)
 library(jsonlite)
 library(readr)
 ## ----cache---------------------------------------------------------------
 # request with a cache directory set
 b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
 # repeating the identical request will hit the cache only!
 b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
 # leaving out the cache parameter performs the request normally
 b = osem_boxes(grouptag = 'ifgi')
 ## ----cachelisting--------------------------------------------------------
 list.files(path = tempdir(), pattern = 'osemcache\\..*\\.rds')
 ## ----cache_custom--------------------------------------------------------
 cache_dir = getwd() # current working directory
 b = osem_boxes(grouptag = 'ifgi', cache = cache_dir)
 # repeating the identical request will hit the cache only!
 b = osem_boxes(grouptag = 'ifgi', cache = cache_dir)
 ## ----clearcache----------------------------------------------------------
 osem_clear_cache() # clears default cache
 osem_clear_cache(getwd()) # clears a custom cache
 ## ----data, results='hide'------------------------------------------------
 # example data used by the serialization chunks below:
 measurements = osem_measurements('Windrichtung')
 ## ----serialize_json------------------------------------------------------
 # serialize the measurements to JSON, then load them from the file again:
 json_file = 'measurements.json'
 write(jsonlite::serializeJSON(measurements), json_file)
 measurements_from_file = jsonlite::unserializeJSON(readr::read_file(json_file))
 class(measurements_from_file)
 ## ----serialize_attrs-----------------------------------------------------
 # note the toJSON call instead of serializeJSON
 plain_json_file = 'measurements_bad.json'
 write(jsonlite::toJSON(measurements), plain_json_file)
 measurements_without_attrs = jsonlite::fromJSON(plain_json_file)
 class(measurements_without_attrs)
 # convert the plain result back with osem_as_measurements()
 measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
 class(measurements_with_attrs)
 ## ----cleanup, include=FALSE----------------------------------------------
 file.remove(c('measurements.json', 'measurements_bad.json'))
--- a/inst/doc/osem-serialization.Rmd
+++ b/inst/doc/osem-serialization.Rmd
@ -1,106 +0,0 @@
 ---
 title: "Caching openSenseMap Data for Reproducibility"
 author: "Norwin Roosen"
 date: "`r Sys.Date()`"
 output: rmarkdown::html_vignette
 vignette: >
  %\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
 ---
 It may be useful to download data from openSenseMap only once.
 For reproducible results, the data should be saved to disk, and reloaded at a
 later point.
 This avoids..
 - changed results for queries without date parameters,
 - unnecessary wait times,
 - risk of API changes / API unavailability,
 - stress on the openSenseMap-server.
 This vignette shows how to use this built-in `opensensmapr` feature, and
 how to do it yourself in case you want to save to other data formats.
 ```{r setup, results='hide'}
 # this vignette requires:
 library(opensensmapr)
 library(jsonlite)
 library(readr)
 ```
 ## Using the opensensmapr Caching Feature
 All data retrieval functions of `opensensmapr` have a built-in caching feature,
 which serializes an API response to disk.
 Subsequent identical requests will then return the serialized data instead of making
 another request.
 To use this feature, just add a path to a directory to the `cache` parameter:
 ```{r cache}
 # request with the `cache` parameter pointing at the session's temp directory
 b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
 # the next identical request will hit the cache only!
 b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
 # requests without the cache parameter will still be performed normally
 b = osem_boxes(grouptag = 'ifgi')
 ```
 Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL:
 ```{r cachelisting}
 # list the cache files that were written to the temp directory
 list.files(path = tempdir(), pattern = 'osemcache\\..*\\.rds')
 ```
 You can maintain multiple caches simultaneously, which allows you to store only the data related to a script in the same directory:
 ```{r cache_custom}
 # use the current working directory as a script-specific cache
 cache_dir = getwd()
 b = osem_boxes(grouptag = 'ifgi', cache = cache_dir)
 # repeating the identical request will hit the cache only!
 b = osem_boxes(grouptag = 'ifgi', cache = cache_dir)
 ```
 To get fresh results again, just call `osem_clear_cache()` for the respective cache:
 ```{r clearcache, results='hide'}
 osem_clear_cache()        # no argument: clears the default cache
 osem_clear_cache(getwd()) # clears the custom cache in the working directory
 ```
 ## Custom (De-) Serialization
 If you want to roll your own serialization method to support custom data formats,
 here's how:
 ```{r data, results='hide'}
 # first get our example data ('Windrichtung' is German for wind direction):
 measurements = osem_measurements('Windrichtung')
 ```
 If you are paranoid and worry about `.rds` files not being decodable anymore
 in the (distant) future, you could serialize to a plain text format such as JSON.
 This of course comes at the cost of storage space and performance.
 ```{r serialize_json}
 # serialize the measurements to JSON, then load them from the file again:
 json_file = 'measurements.json'
 write(jsonlite::serializeJSON(measurements), json_file)
 measurements_from_file = jsonlite::unserializeJSON(readr::read_file(json_file))
 class(measurements_from_file)
 ```
 This method also persists the R object metadata (classes, attributes).
 If you were to use a serialization method that can't persist object metadata, you
 could re-apply it with the following functions:
 ```{r serialize_attrs}
 # note the toJSON call instead of serializeJSON
 plain_json_file = 'measurements_bad.json'
 write(jsonlite::toJSON(measurements), plain_json_file)
 measurements_without_attrs = jsonlite::fromJSON(plain_json_file)
 class(measurements_without_attrs)
 # convert the plain result back with osem_as_measurements()
 measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
 class(measurements_with_attrs)
 ```
 The same goes for boxes via `osem_as_sensebox()`.
 ```{r cleanup, include=FALSE}
 # remove the JSON files written by the serialization examples
 file.remove(c('measurements.json', 'measurements_bad.json'))
 ```
--- a/inst/doc/osem-serialization.html
+++ b/inst/doc/osem-serialization.html