add inst/doc

2025-10-17 21:33:52 +02:00 · 2023-02-23 15:12:46 +01:00 · 2023-02-23 15:12:46 +01:00 · 37d4dde1d6
commit 37d4dde1d6
parent 62667ef139
12 changed files with 6822 additions and 0 deletions
--- a/inst/doc/osem-history.R
+++ b/inst/doc/osem-history.R
@ -0,0 +1,133 @@
+## ----setup, results='hide', message=FALSE, warning=FALSE----------------------
+# required packages:
+library(opensensmapr) # data download
+library(dplyr)        # data wrangling
+library(ggplot2)      # plotting
+library(lubridate)    # date arithmetic
+library(zoo)          # rollmean()
+
+## ----download-----------------------------------------------------------------
+# if you want to see results for a specific subset of boxes,
+# just specify a filter such as grouptag='ifgi' here
+boxes = osem_boxes()
+
+## ----exposure_counts, message=FALSE-------------------------------------------
+exposure_counts = boxes %>%
+  group_by(exposure) %>%
+  mutate(count = row_number(createdAt))
+
+exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
+ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
+  geom_line() +
+  scale_colour_manual(values = exposure_colors) +
+  xlab('Registration Date') + ylab('senseBox count')
+
+## ----exposure_summary---------------------------------------------------------
+exposure_counts %>%
+  summarise(
+    oldest = min(createdAt),
+    newest = max(createdAt),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+## ----grouptag_counts, message=FALSE-------------------------------------------
+grouptag_counts = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 8 or more members
+  filter(length(grouptag) >= 8 & !is.na(grouptag)) %>%
+  mutate(count = row_number(createdAt))
+
+# helper: re-level a vector of grouptags so the levels are ordered by box count
+sortLvls = function(oldFactor, ascending = TRUE) {
+  # number of boxes per distinct value
+  frequencies = table(oldFactor)
+  # level names in the requested order of frequency
+  ordered_levels = names(sort(frequencies, decreasing = !ascending))
+  factor(oldFactor, levels = ordered_levels)
+}
+grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
+
+ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
+  geom_line(aes(group = grouptag)) +
+  xlab('Registration Date') + ylab('senseBox count')
+
+## ----grouptag_summary---------------------------------------------------------
+grouptag_counts %>%
+  summarise(
+    oldest = min(createdAt),
+    newest = max(createdAt),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'------
+bins = 'week'
+mvavg_bins = 6
+
+growth = boxes %>%
+  mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'registered')
+
+## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'--------
+inactive = boxes %>%
+  # remove boxes that were updated in the last two days,
+  # b/c any box becomes inactive at some point by definition of updatedAt
+  filter(updatedAt < now() - days(2)) %>%
+  mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'inactive')
+
+## ----growthrate, warning=FALSE, message=FALSE, results='hide'-----------------
+boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
+
+# moving average per event. This is computed here rather than inside aes(),
+# b/c ggplot2 evaluates aes() on the whole data frame (ignoring the grouping),
+# which would average across the boundary between both event series.
+# first and last values are NA (to ensure identical length of vectors);
+# a series shorter than the window gets no moving average at all.
+boxes_smoothed = boxes_by_date %>%
+  mutate(mvavg = if (n() >= mvavg_bins) {
+    rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))
+  } else {
+    NA_real_
+  })
+
+ggplot(boxes_smoothed, aes(x = as.Date(week), colour = event)) +
+  xlab('Time') + ylab(paste('rate per', bins)) +
+  scale_x_date(date_breaks="years", date_labels="%Y") +
+  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
+  geom_point(aes(y = count), size = 0.5) +
+  geom_line(aes(y = mvavg))
+
+## ----exposure_duration, message=FALSE-----------------------------------------
+duration = boxes %>%
+  group_by(exposure) %>%
+  filter(!is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+
+ggplot(duration, aes(x = exposure, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+
+## ----grouptag_duration, message=FALSE-----------------------------------------
+duration = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 8 or more members
+  filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+  
+ggplot(duration, aes(x = grouptag, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+
+duration %>%
+  summarize(
+    duration_avg = round(mean(duration)),
+    duration_min = round(min(duration)),
+    duration_max = round(max(duration)),
+    oldest_box = round(max(difftime(now(), createdAt, units='days')))
+  ) %>%
+  arrange(desc(duration_avg))
+
+## ----year_duration, message=FALSE---------------------------------------------
+# NOTE: boxes older than 2016 missing due to missing updatedAt in database
+duration = boxes %>%
+  mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
+  group_by(year) %>%
+  filter(!is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+
+ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
+
--- a/inst/doc/osem-history.Rmd
+++ b/inst/doc/osem-history.Rmd
@ -0,0 +1,243 @@
+---
+title: "Visualising the History of openSenseMap.org"
+author: "Norwin Roosen"
+date: '`r Sys.Date()`'
+output:
+  rmarkdown::html_vignette:
+    df_print: kable
+    fig_height: 5
+    fig_width: 7
+    toc: yes
+  html_document:
+    code_folding: hide
+    df_print: kable
+    theme: lumen
+    toc: yes
+    toc_float: yes
+vignette: >
+  %\VignetteIndexEntry{Visualising the History of openSenseMap.org}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+> This vignette serves as an example of data wrangling & visualization with
+`opensensmapr`, `dplyr` and `ggplot2`.
+
+```{r setup, results='hide', message=FALSE, warning=FALSE}
+# required packages:
+library(opensensmapr) # data download
+library(dplyr)        # data wrangling
+library(ggplot2)      # plotting
+library(lubridate)    # date arithmetic
+library(zoo)          # rollmean()
+```
+
+openSenseMap.org has grown quite a bit in the last years; it would be interesting
+to see how we got to the current `r osem_counts()$boxes` sensor stations,
+split up by various attributes of the boxes.
+
+While `opensensmapr` provides extensive methods of filtering boxes by attributes
+on the server, we do the filtering within R to save time and gain flexibility.
+So the first step is to retrieve *all the boxes*:
+
+```{r download}
+# if you want to see results for a specific subset of boxes,
+# just specify a filter such as grouptag='ifgi' here
+boxes = osem_boxes()
+```
+
+# Plot count of boxes by time {.tabset}
+By looking at the `createdAt` attribute of each box we know the exact time a box
+was registered.
+With this approach we have no information about boxes that were deleted in the
+meantime, but that's okay for now.
+
+## ...and exposure
+```{r exposure_counts, message=FALSE}
+exposure_counts = boxes %>%
+  group_by(exposure) %>%
+  mutate(count = row_number(createdAt))
+
+exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
+ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
+  geom_line() +
+  scale_colour_manual(values = exposure_colors) +
+  xlab('Registration Date') + ylab('senseBox count')
+```
+
+Outdoor boxes are growing *fast*!
+We can also see the introduction of `mobile` sensor "stations" in 2017. While
+mobile boxes are still few, we can expect a quick rise in 2018 once the new
+senseBox MCU with GPS support is released.
+
+Let's have a quick summary:
+```{r exposure_summary}
+exposure_counts %>%
+  summarise(
+    oldest = min(createdAt),
+    newest = max(createdAt),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+```
+
+## ...and grouptag
+We can try to find out where the increases in growth came from, by analysing the 
+box count by grouptag.
+
+Caveats: Only a small subset of boxes has a grouptag, and we should assume
+that these groups are actually bigger. Also, we can see that grouptag naming is
+inconsistent (`Luftdaten`, `luftdaten.info`, ...)
+
+```{r grouptag_counts, message=FALSE}
+grouptag_counts = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 8 or more members
+  filter(length(grouptag) >= 8 & !is.na(grouptag)) %>%
+  mutate(count = row_number(createdAt))
+
+# helper: re-level a vector of grouptags so the levels are ordered by box count
+sortLvls = function(oldFactor, ascending = TRUE) {
+  # number of boxes per distinct value
+  frequencies = table(oldFactor)
+  # level names in the requested order of frequency
+  ordered_levels = names(sort(frequencies, decreasing = !ascending))
+  factor(oldFactor, levels = ordered_levels)
+}
+grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
+
+ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
+  geom_line(aes(group = grouptag)) +
+  xlab('Registration Date') + ylab('senseBox count')
+```
+
+```{r grouptag_summary}
+grouptag_counts %>%
+  summarise(
+    oldest = min(createdAt),
+    newest = max(createdAt),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+```
+
+# Plot rate of growth and inactivity per week
+First we group the boxes by `createdAt` into bins of one week:
+```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
+bins = 'week'
+mvavg_bins = 6
+
+growth = boxes %>%
+  mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'registered')
+```
+
+We can do the same for `updatedAt`, which informs us about the last change to
+a box, including uploaded measurements.
+This method of determining inactive boxes is fairly inaccurate and should be
+considered an approximation, because we have no information about intermediate
+inactive phases.
+Also deleted boxes would probably have a big impact here.
+```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
+inactive = boxes %>%
+  # remove boxes that were updated in the last two days,
+  # b/c any box becomes inactive at some point by definition of updatedAt
+  filter(updatedAt < now() - days(2)) %>%
+  mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'inactive')
+```
+
+Now we can combine both datasets for plotting:
+```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
+boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
+
+# moving average per event. This is computed here rather than inside aes(),
+# b/c ggplot2 evaluates aes() on the whole data frame (ignoring the grouping),
+# which would average across the boundary between both event series.
+# first and last values are NA (to ensure identical length of vectors);
+# a series shorter than the window gets no moving average at all.
+boxes_smoothed = boxes_by_date %>%
+  mutate(mvavg = if (n() >= mvavg_bins) {
+    rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))
+  } else {
+    NA_real_
+  })
+
+ggplot(boxes_smoothed, aes(x = as.Date(week), colour = event)) +
+  xlab('Time') + ylab(paste('rate per', bins)) +
+  scale_x_date(date_breaks="years", date_labels="%Y") +
+  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
+  geom_point(aes(y = count), size = 0.5) +
+  geom_line(aes(y = mvavg))
+```
+
+We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`.
+This was enabled by an integration of openSenseMap.org into the firmware of the
+air quality monitoring project [luftdaten.info](https://sensor.community/de/).
+The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues
+of the senseBox hardware, but I have no data on the exact time frames to verify.
+
+# Plot duration of boxes being active {.tabset}
+While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity
+of each box, and look at metrics by exposure and grouptag once more:
+
+## ...by exposure
+```{r exposure_duration, message=FALSE}
+duration = boxes %>%
+  group_by(exposure) %>%
+  filter(!is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+
+ggplot(duration, aes(x = exposure, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+```
+
+The time of activity averages at only `r round(mean(duration$duration))` days,
+though there are boxes with `r round(max(duration$duration))` days of activity,
+spanning a large chunk of openSenseMap's existence.
+
+## ...by grouptag
+```{r grouptag_duration, message=FALSE}
+duration = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 8 or more members
+  filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+  
+ggplot(duration, aes(x = grouptag, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+
+duration %>%
+  summarize(
+    duration_avg = round(mean(duration)),
+    duration_min = round(min(duration)),
+    duration_max = round(max(duration)),
+    oldest_box = round(max(difftime(now(), createdAt, units='days')))
+  ) %>%
+  arrange(desc(duration_avg))
+```
+
+The time of activity averages at only `r round(mean(duration$duration))` days,
+though there are boxes with `r round(max(duration$duration))` days of activity,
+spanning a large chunk of openSenseMap's existence.
+
+## ...by year of registration
+This is less useful, as older boxes are active for a longer time by definition.
+If you have an idea how to compensate for that, please send a [Pull Request][PR]!
+
+```{r year_duration, message=FALSE}
+# NOTE: boxes older than 2016 missing due to missing updatedAt in database
+duration = boxes %>%
+  mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
+  group_by(year) %>%
+  filter(!is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+
+ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
+```
+
+# More Visualisations
+Other visualisations come to mind, and are left as an exercise to the reader.
+If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
+
+* growth by phenomenon
+* growth by location -> (interactive) map
+* set inactive rate in relation to total box count
+* filter timespans with big dips in growth rate, and extrapolate the amount of
+  senseBoxes that could be on the platform today, assuming there were no production issues ;)
+
+[PR]: https://github.com/sensebox/opensensmapr/pulls
--- a/inst/doc/osem-history.html
+++ b/inst/doc/osem-history.html
--- a/inst/doc/osem-history_revised.R
+++ b/inst/doc/osem-history_revised.R
@ -0,0 +1,162 @@
+## ----setup, results='hide', message=FALSE, warning=FALSE----------------------
+# required packages:
+library(opensensmapr) # data download
+library(dplyr)        # data wrangling
+library(ggplot2)      # plotting
+library(lubridate)    # date arithmetic
+library(zoo)          # rollmean()
+
+## ----download, results='hide', message=FALSE, warning=FALSE-------------------
+# if you want to see results for a specific subset of boxes,
+# just specify a filter such as grouptag='ifgi' here
+boxes_all = osem_boxes()
+boxes = boxes_all
+
+## -----------------------------------------------------------------------------
+# keep boxes whose location timestamp lies in 2022 (UTC). The upper bound is
+# exclusive: a bound of "2022-12-31" is read as midnight at the start of that
+# day and would drop everything registered on Dec 31st.
+# assumes locationtimestamp is POSIXct -- TODO confirm
+year_start = as.POSIXct("2022-01-01", tz = "UTC")
+year_end   = as.POSIXct("2023-01-01", tz = "UTC")
+boxes = filter(boxes, locationtimestamp >= year_start & locationtimestamp < year_end)
+# stored under a name of its own, so the name of the summary.data.frame() S3
+# method is not shadowed by a non-function object
+boxes_summary = summary(boxes)
+
+## ----message=F, warning=F-----------------------------------------------------
+# optional dependencies of plot(): install whatever is missing.
+# requireNamespace() only checks availability, it does not attach the package
+for (pkg in c('maps', 'maptools', 'rgeos')) {
+  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
+}
+
+plot(boxes)
+
+## -----------------------------------------------------------------------------
+phenoms = osem_phenomena(boxes)
+str(phenoms)
+
+## -----------------------------------------------------------------------------
+phenoms[phenoms > 50]
+
+## ----exposure_counts, message=FALSE-------------------------------------------
+exposure_counts = boxes %>%
+  group_by(exposure) %>%
+  mutate(count = row_number(locationtimestamp))
+
+exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
+ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
+  geom_line() +
+  scale_colour_manual(values = exposure_colors) +
+  xlab('Registration Date') + ylab('senseBox count')
+
+## ----exposure_summary---------------------------------------------------------
+exposure_counts %>%
+  summarise(
+    oldest = min(locationtimestamp),
+    newest = max(locationtimestamp),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+## ----grouptag_counts, message=FALSE-------------------------------------------
+grouptag_counts = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 15 or more members
+  filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>%
+  mutate(count = row_number(locationtimestamp))
+
+# helper: re-level a vector of grouptags so the levels are ordered by box count
+sortLvls = function(oldFactor, ascending = TRUE) {
+  # number of boxes per distinct value
+  frequencies = table(oldFactor)
+  # level names in the requested order of frequency
+  ordered_levels = names(sort(frequencies, decreasing = !ascending))
+  factor(oldFactor, levels = ordered_levels)
+}
+grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
+
+ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
+  geom_line(aes(group = grouptag)) +
+  xlab('Registration Date') + ylab('senseBox count')
+
+## ----grouptag_summary---------------------------------------------------------
+grouptag_counts %>%
+  summarise(
+    oldest = min(locationtimestamp),
+    newest = max(locationtimestamp),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'------
+bins = 'week'
+mvavg_bins = 6
+
+growth = boxes %>%
+  mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'registered')
+
+## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'--------
+inactive = boxes %>%
+  # remove boxes that sent a measurement within the last two days,
+  # b/c any box becomes inactive at some point by definition of lastMeasurement
+  filter(lastMeasurement < now() - days(2)) %>%
+  # bin the date of the last measurement (bin width is set by `bins`)
+  mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
+  # only keep bins that start after 2021-12-31
+  filter(as.Date(week) > as.Date("2021-12-31")) %>%
+  group_by(week) %>%
+  # number of boxes whose last measurement falls into each bin
+  summarize(count = length(week)) %>%
+  mutate(event = 'inactive')
+
+## ----growthrate, warning=FALSE, message=FALSE, results='hide'-----------------
+boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
+
+# moving average per event. This is computed here rather than inside aes(),
+# b/c ggplot2 evaluates aes() on the whole data frame (ignoring the grouping),
+# which would average across the boundary between both event series.
+# first and last values are NA (to ensure identical length of vectors);
+# a series shorter than the window gets no moving average at all.
+# kept in a separate data frame so boxes_by_date keeps its original columns
+boxes_smoothed = boxes_by_date %>%
+  mutate(mvavg = if (n() >= mvavg_bins) {
+    rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))
+  } else {
+    NA_real_
+  })
+
+ggplot(boxes_smoothed, aes(x = as.Date(week), colour = event)) +
+  xlab('Time') + ylab(paste('rate per', bins)) +
+  scale_x_date(date_breaks="years", date_labels="%Y") +
+  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
+  geom_point(aes(y = count), size = 0.5) +
+  geom_line(aes(y = mvavg))
+
+## ----table_mostregistrations--------------------------------------------------
+boxes_by_date %>%
+  filter(count > 50) %>%
+  arrange(desc(count))
+
+## ----exposure_duration, message=FALSE-----------------------------------------
+durations = boxes %>%
+  group_by(exposure) %>%
+  filter(!is.na(lastMeasurement)) %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  filter(duration >= 0)
+
+ggplot(durations, aes(x = exposure, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+
+## ----grouptag_duration, message=FALSE-----------------------------------------
+durations = boxes %>%
+  # boxes that never sent a measurement have no duration of activity
+  filter(!is.na(lastMeasurement)) %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 15 or more (measuring) members;
+  # empty grouptags are excluded just like in the grouptag_counts chunk
+  filter(n() >= 15 & !is.na(grouptag) & grouptag != '') %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  # discard boxes whose last measurement predates the location timestamp
+  filter(duration >= 0)
+  
+ggplot(durations, aes(x = grouptag, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+
+durations %>%
+  summarize(
+    duration_avg = round(mean(duration)),
+    duration_min = round(min(duration)),
+    duration_max = round(max(duration)),
+    oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
+  ) %>%
+  arrange(desc(duration_avg))
+
+## ----year_duration, message=FALSE---------------------------------------------
+# NOTE: boxes without a lastMeasurement are dropped below
+# (the note inherited from the original vignette read: boxes older than 2016
+# missing due to missing updatedAt in database)
+duration = boxes %>%
+  # bin the location timestamps by calendar year
+  mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
+  group_by(year) %>%
+  filter(!is.na(lastMeasurement)) %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  # discard boxes whose last measurement predates the location timestamp
+  filter(duration >= 0)
+
+ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
+
--- a/inst/doc/osem-history_revised.Rmd
+++ b/inst/doc/osem-history_revised.Rmd
@ -0,0 +1,300 @@
+---
+title: "Visualising the Development of openSenseMap.org in 2022"
+author: "Jan Stenkamp"
+date: '`r Sys.Date()`'
+output:
+  html_document:
+    code_folding: hide
+    df_print: kable
+    theme: lumen
+    toc: yes
+    toc_float: yes
+  rmarkdown::html_vignette:
+    df_print: kable
+    fig_height: 5
+    fig_width: 7
+    toc: yes
+vignette: >
+  %\VignetteIndexEntry{Visualising the Development of openSenseMap.org in 2022}
+  %\VignetteEncoding{UTF-8}
+  %\VignetteEngine{knitr::rmarkdown}
+---
+
+> This vignette serves as an example of data wrangling & visualization with
+`opensensmapr`, `dplyr` and `ggplot2`.
+
+```{r setup, results='hide', message=FALSE, warning=FALSE}
+# required packages:
+library(opensensmapr) # data download
+library(dplyr)        # data wrangling
+library(ggplot2)      # plotting
+library(lubridate)    # date arithmetic
+library(zoo)          # rollmean()
+```
+
+openSenseMap.org has grown quite a bit in the last years; it would be interesting
+to see how we got to the current `r osem_counts()$boxes` sensor stations,
+split up by various attributes of the boxes.
+
+While `opensensmapr` provides extensive methods of filtering boxes by attributes
+on the server, we do the filtering within R to save time and gain flexibility.
+
+
+So the first step is to retrieve *all the boxes*. 
+
+```{r download, results='hide', message=FALSE, warning=FALSE}
+# if you want to see results for a specific subset of boxes,
+# just specify a filter such as grouptag='ifgi' here
+boxes_all = osem_boxes()
+boxes = boxes_all
+```
+# Introduction
+In the following we just want to have a look at the boxes created in 2022, so we filter for them. 
+
+```{r}
+# keep boxes whose location timestamp lies in 2022 (UTC). The upper bound is
+# exclusive: a bound of "2022-12-31" is read as midnight at the start of that
+# day and would drop everything registered on Dec 31st.
+# assumes locationtimestamp is POSIXct -- TODO confirm
+year_start = as.POSIXct("2022-01-01", tz = "UTC")
+year_end   = as.POSIXct("2023-01-01", tz = "UTC")
+boxes = filter(boxes, locationtimestamp >= year_start & locationtimestamp < year_end)
+# stored under a name of its own, so the name of the summary.data.frame() S3
+# method is not shadowed by a non-function object
+boxes_summary = summary(boxes)
+```
+
+<!-- This gives a good overview already: As of writing this, there are more than 11,000 -->
+<!-- sensor stations, of which ~30% are currently running. Most of them are placed -->
+<!-- outdoors and have around 5 sensors each. -->
+<!-- The oldest station is from August 2016, while the latest station was registered a -->
+<!-- couple of minutes ago. -->
+
+Another feature of interest is the spatial distribution of the boxes: `plot()`
+can help us out here. This function requires a bunch of optional dependencies though.
+
+```{r message=F, warning=F}
+# optional dependencies of plot(): install whatever is missing.
+# requireNamespace() only checks availability, it does not attach the package
+for (pkg in c('maps', 'maptools', 'rgeos')) {
+  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
+}
+
+plot(boxes)
+```
+
+But what do these sensor stations actually measure? Let's find out.
+`osem_phenomena()` gives us a named list of the counts of each observed
+phenomenon for the given set of sensor stations:
+
+```{r}
+phenoms = osem_phenomena(boxes)
+str(phenoms)
+```
+
+That's quite some noise there, with many phenomena being measured by a single
+sensor only, or many duplicated phenomena due to slightly different spellings.
+We should clean that up, but for now let's just filter out the noise and find
+those phenomena with high sensor numbers:
+
+```{r}
+phenoms[phenoms > 50]
+```
+
+
+# Plot count of boxes by time {.tabset}
+By looking at the `createdAt` attribute of each box we know the exact time a box
+was registered. Because of some database migration issues the `createdAt` values are mostly wrong (~80% of boxes created 2022-03-30), so we are using the `timestamp` attribute of the `currentlocation` which should in most cases correspond to the creation date.
+
+With this approach we have no information about boxes that were deleted in the
+meantime, but that's okay for now.
+
+## ...and exposure
+```{r exposure_counts, message=FALSE}
+exposure_counts = boxes %>%
+  group_by(exposure) %>%
+  mutate(count = row_number(locationtimestamp))
+
+exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
+ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
+  geom_line() +
+  scale_colour_manual(values = exposure_colors) +
+  xlab('Registration Date') + ylab('senseBox count')
+```
+
+Outdoor boxes are growing *fast*!
+We can also see the introduction of `mobile` sensor "stations" in 2017. 
+
+Let's have a quick summary:
+```{r exposure_summary}
+exposure_counts %>%
+  summarise(
+    oldest = min(locationtimestamp),
+    newest = max(locationtimestamp),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+```
+
+## ...and grouptag
+We can try to find out where the increases in growth came from, by analysing the 
+box count by grouptag.
+
+Caveats: Only a small subset of boxes has a grouptag, and we should assume
+that these groups are actually bigger. Also, we can see that grouptag naming is
+inconsistent (`Luftdaten`, `luftdaten.info`, ...)
+
+```{r grouptag_counts, message=FALSE}
+grouptag_counts = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 15 or more members
+  filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>%
+  mutate(count = row_number(locationtimestamp))
+
+# helper: re-level a vector of grouptags so the levels are ordered by box count
+sortLvls = function(oldFactor, ascending = TRUE) {
+  # number of boxes per distinct value
+  frequencies = table(oldFactor)
+  # level names in the requested order of frequency
+  ordered_levels = names(sort(frequencies, decreasing = !ascending))
+  factor(oldFactor, levels = ordered_levels)
+}
+grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
+
+ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
+  geom_line(aes(group = grouptag)) +
+  xlab('Registration Date') + ylab('senseBox count')
+```
+
+```{r grouptag_summary}
+grouptag_counts %>%
+  summarise(
+    oldest = min(locationtimestamp),
+    newest = max(locationtimestamp),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+```
+
+# Plot rate of growth and inactivity per week
+First we group the boxes by `locationtimestamp` into bins of one week:
+```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
+bins = 'week'
+mvavg_bins = 6
+
+growth = boxes %>%
+  mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'registered')
+```
+
+We can do the same for `updatedAt`, which informs us about the last change to
+a box, including uploaded measurements. As a lot of boxes were "updated" by the database
+migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement` 
+attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes 
+boxes which were created but never made a measurement.
+
+This method of determining inactive boxes is fairly inaccurate and should be
+considered an approximation, because we have no information about intermediate
+inactive phases.
+Also deleted boxes would probably have a big impact here.
+```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
+inactive = boxes %>%
+  # remove boxes that sent a measurement within the last two days,
+  # b/c any box becomes inactive at some point by definition of lastMeasurement
+  filter(lastMeasurement < now() - days(2)) %>%
+  # bin the date of the last measurement (bin width is set by `bins`)
+  mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
+  # only keep bins that start after 2021-12-31
+  filter(as.Date(week) > as.Date("2021-12-31")) %>%
+  group_by(week) %>%
+  # number of boxes whose last measurement falls into each bin
+  summarize(count = length(week)) %>%
+  mutate(event = 'inactive')
+```
+
+Now we can combine both datasets for plotting:
+```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
+boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
+
+# moving average per event. This is computed here rather than inside aes(),
+# b/c ggplot2 evaluates aes() on the whole data frame (ignoring the grouping),
+# which would average across the boundary between both event series.
+# first and last values are NA (to ensure identical length of vectors);
+# a series shorter than the window gets no moving average at all.
+# kept in a separate data frame so boxes_by_date keeps its original columns
+boxes_smoothed = boxes_by_date %>%
+  mutate(mvavg = if (n() >= mvavg_bins) {
+    rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))
+  } else {
+    NA_real_
+  })
+
+ggplot(boxes_smoothed, aes(x = as.Date(week), colour = event)) +
+  xlab('Time') + ylab(paste('rate per', bins)) +
+  scale_x_date(date_breaks="years", date_labels="%Y") +
+  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
+  geom_point(aes(y = count), size = 0.5) +
+  geom_line(aes(y = mvavg))
+```
+
+And see in which weeks the most boxes become (in)active:
+```{r table_mostregistrations}
+boxes_by_date %>%
+  filter(count > 50) %>%
+  arrange(desc(count))
+```
+
+# Plot duration of boxes being active {.tabset}
+While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity
+of each box, and look at metrics by exposure and grouptag once more:
+
+## ...by exposure
+```{r exposure_duration, message=FALSE}
+durations = boxes %>%
+  group_by(exposure) %>%
+  filter(!is.na(lastMeasurement)) %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  filter(duration >= 0)
+
+ggplot(durations, aes(x = exposure, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+```
+
+The time of activity averages at only `r round(mean(durations$duration))` days,
+though there are boxes with `r round(max(durations$duration))` days of activity,
+spanning a large chunk of openSenseMap's existence.
+
+## ...by grouptag
+```{r grouptag_duration, message=FALSE}
+durations = boxes %>%
+  # boxes that never sent a measurement have no duration of activity
+  filter(!is.na(lastMeasurement)) %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 15 or more (measuring) members;
+  # empty grouptags are excluded just like in the grouptag_counts chunk
+  filter(n() >= 15 & !is.na(grouptag) & grouptag != '') %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  # discard boxes whose last measurement predates the location timestamp
+  filter(duration >= 0)
+  
+ggplot(durations, aes(x = grouptag, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+
+durations %>%
+  summarize(
+    duration_avg = round(mean(duration)),
+    duration_min = round(min(duration)),
+    duration_max = round(max(duration)),
+    oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
+  ) %>%
+  arrange(desc(duration_avg))
+```
+
+The time of activity averages at only `r round(mean(durations$duration))` days,
+though there are boxes with `r round(max(durations$duration))` days of activity,
+spanning a large chunk of openSenseMap's existence.
+
+## ...by year of registration
+This is less useful, as older boxes are active for a longer time by definition.
+If you have an idea how to compensate for that, please send a [Pull Request][PR]!
+
+```{r year_duration, message=FALSE}
+# NOTE: boxes without a lastMeasurement are dropped below
+# (the note inherited from the original vignette read: boxes older than 2016
+# missing due to missing updatedAt in database)
+duration = boxes %>%
+  # bin the location timestamps by calendar year
+  mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
+  group_by(year) %>%
+  filter(!is.na(lastMeasurement)) %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  # discard boxes whose last measurement predates the location timestamp
+  filter(duration >= 0)
+
+ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
+```
+
+# More Visualisations
+Other visualisations come to mind, and are left as an exercise to the reader.
+If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
+
+* growth by phenomenon
+* growth by location -> (interactive) map
+* set inactive rate in relation to total box count
+* filter timespans with big dips in growth rate, and extrapolate the amount of
+  senseBoxes that could be on the platform today, assuming there were no production issues ;)
+
+[PR]: https://github.com/sensebox/opensensmapr/pulls
+
+
--- a/inst/doc/osem-history_revised.html
+++ b/inst/doc/osem-history_revised.html
--- a/inst/doc/osem-intro.R
+++ b/inst/doc/osem-intro.R
@ -0,0 +1,73 @@
+## ----setup, include=FALSE-----------------------------------------------------
+knitr::opts_chunk$set(echo = TRUE)
+
+## ----results = F--------------------------------------------------------------
+library(magrittr)
+library(opensensmapr)
+
+all_sensors = osem_boxes()
+
+## -----------------------------------------------------------------------------
+summary(all_sensors)
+
+## ----message=F, warning=F-----------------------------------------------------
+# plot() for senseBoxes needs some optional packages; install the ones that
+# are missing. requireNamespace() only checks availability and, unlike
+# require(), does not attach the package to the search path.
+# NOTE(review): assumes plot() does not need these packages attached - confirm
+for (pkg in c('maps', 'maptools', 'rgeos')) {
+  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
+}
+
+plot(all_sensors)
+
+## -----------------------------------------------------------------------------
+phenoms = osem_phenomena(all_sensors)
+str(phenoms)
+
+## -----------------------------------------------------------------------------
+phenoms[phenoms > 20]
+
+## ----results = F--------------------------------------------------------------
+pm25_sensors = osem_boxes(
+  exposure = 'outdoor',
+  date = Sys.time(), # ±4 hours
+  phenomenon = 'PM2.5'
+)
+
+## -----------------------------------------------------------------------------
+summary(pm25_sensors)
+plot(pm25_sensors)
+
+## -----------------------------------------------------------------------------
+library(sf)
+library(units)
+library(lubridate)
+library(dplyr)
+
+# construct a bounding box: 12 kilometers around Berlin
+berlin = st_point(c(13.4034, 52.5120)) %>%
+  st_sfc(crs = 4326) %>%
+  # buffer in a metric CRS with little distortion around Berlin
+  # (EPSG:25833, ETRS89 / UTM zone 33N). Web Mercator (EPSG:3857) stretches
+  # distances by about 1 / cos(latitude), so a 12 km buffer computed there
+  # would only cover roughly 7 km on the ground at this latitude.
+  st_transform(25833) %>%
+  st_buffer(set_units(12, km)) %>%
+  st_transform(4326) %>% # the opensensemap expects WGS 84
+  st_bbox()
+
+## ----results = F--------------------------------------------------------------
+pm25 = osem_measurements(
+  berlin,
+  phenomenon = 'PM2.5',
+  from = now() - days(3), # defaults to 2 days
+  to = now()
+)
+
+plot(pm25)
+
+## -----------------------------------------------------------------------------
+# every sensor that reported at least one value above 100 is flagged as
+# invalid. unique(as.character()) yields the distinct ids whether `sensorId`
+# is a factor or a character column; levels() only works for factors.
+outliers = filter(pm25, value > 100)$sensorId
+bad_sensors = unique(as.character(outliers))
+
+pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
+
+## -----------------------------------------------------------------------------
+# measurement locations, coloured by the `invalid` flag
+st_as_sf(pm25) %>%
+  st_geometry() %>%
+  plot(col = factor(pm25$invalid), axes = TRUE)
+
+## -----------------------------------------------------------------------------
+# plot only the measurements that were not flagged as invalid
+pm25 %>%
+  filter(!invalid) %>%
+  plot()
+
--- a/inst/doc/osem-intro.Rmd
+++ b/inst/doc/osem-intro.Rmd
@ -0,0 +1,151 @@
+---
+title: "Exploring the openSenseMap Dataset"
+author: "Norwin Roosen"
+date: "`r Sys.Date()`"
+output:
+  rmarkdown::html_vignette:
+    fig_margin: 0
+    fig_width: 6
+    fig_height: 4
+vignette: >
+  %\VignetteIndexEntry{Exploring the openSenseMap Dataset}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+This package provides data ingestion functions for almost any data stored on the
+open data platform for environmental sensor data <https://opensensemap.org>.
+Its main goals are to provide means for:
+
+- big data analysis of the measurements stored on the platform
+- sensor metadata analysis (sensor counts, spatial distribution, temporal trends)
+
+### Exploring the dataset
+Before we look at actual observations, let's get a grasp of the openSenseMap
+dataset's structure.
+
+```{r results = F}
+library(magrittr)
+library(opensensmapr)
+
+all_sensors = osem_boxes()
+```
+```{r}
+summary(all_sensors)
+```
+
+This gives a good overview already: As of writing this, there are more than 700
+sensor stations, of which ~50% are currently running. Most of them are placed
+outdoors and have around 5 sensors each.
+The oldest station is from May 2014, while the latest station was registered a
+couple of minutes ago.
+
+Another feature of interest is the spatial distribution of the boxes: `plot()`
+can help us out here. This function requires a bunch of optional dependencies though.
+
+```{r message=F, warning=F}
+# plot() for senseBoxes needs some optional packages; install the ones that
+# are missing. requireNamespace() only checks availability and, unlike
+# require(), does not attach the package to the search path.
+# NOTE(review): assumes plot() does not need these packages attached - confirm
+for (pkg in c('maps', 'maptools', 'rgeos')) {
+  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
+}
+
+plot(all_sensors)
+```
+
+It seems we have to reduce our area of interest to Germany.
+
+But what do these sensor stations actually measure? Let's find out.
+`osem_phenomena()` gives us a named list of the counts of each observed
+phenomenon for the given set of sensor stations:
+
+```{r}
+phenoms = osem_phenomena(all_sensors)
+str(phenoms)
+```
+
+That's quite some noise there, with many phenomena being measured by a single
+sensor only, or many duplicated phenomena due to slightly different spellings.
+We should clean that up, but for now let's just filter out the noise and find
+those phenomena with high sensor numbers:
+
+```{r}
+phenoms[phenoms > 20]
+```
+
+Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more
+interesting to analyze though. 
+We should check how many sensor stations provide useful data: We want only those
+boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting
+measurements:
+
+```{r results = F}
+pm25_sensors = osem_boxes(
+  exposure = 'outdoor',
+  date = Sys.time(), # ±4 hours
+  phenomenon = 'PM2.5'
+)
+```
+```{r}
+summary(pm25_sensors)
+plot(pm25_sensors)
+```
+
+That's still more than 200 measuring stations; we can work with that.
+
+### Analyzing sensor data
+Having analyzed the available data sources, let's finally get some measurements.
+We could call `osem_measurements(pm25_sensors)` now, however we are focusing on
+a restricted area of interest, the city of Berlin.
+Luckily we can get the measurements filtered by a bounding box:
+
+```{r}
+library(sf)
+library(units)
+library(lubridate)
+library(dplyr)
+
+# construct a bounding box: 12 kilometers around Berlin
+berlin = st_point(c(13.4034, 52.5120)) %>%
+  st_sfc(crs = 4326) %>%
+  # buffer in a metric CRS with little distortion around Berlin
+  # (EPSG:25833, ETRS89 / UTM zone 33N). Web Mercator (EPSG:3857) stretches
+  # distances by about 1 / cos(latitude), so a 12 km buffer computed there
+  # would only cover roughly 7 km on the ground at this latitude.
+  st_transform(25833) %>%
+  st_buffer(set_units(12, km)) %>%
+  st_transform(4326) %>% # the opensensemap expects WGS 84
+  st_bbox()
+```
+```{r results = F}
+pm25 = osem_measurements(
+  berlin,
+  phenomenon = 'PM2.5',
+  from = now() - days(3), # defaults to 2 days
+  to = now()
+)
+
+plot(pm25)
+```
+
+Now we can get started with actual spatiotemporal data analysis.
+First, let's mask the seemingly uncalibrated sensors:
+
+```{r}
+# every sensor that reported at least one value above 100 is flagged as
+# invalid. unique(as.character()) yields the distinct ids whether `sensorId`
+# is a factor or a character column; levels() only works for factors.
+outliers = filter(pm25, value > 100)$sensorId
+bad_sensors = unique(as.character(outliers))
+
+pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
+```
+
+Then plot the measuring locations, flagging the outliers:
+
+```{r}
+# measurement locations, coloured by the `invalid` flag
+st_as_sf(pm25) %>%
+  st_geometry() %>%
+  plot(col = factor(pm25$invalid), axes = TRUE)
+```
+
+Removing these sensors yields a nicer time series plot:
+
+```{r}
+# plot only the measurements that were not flagged as invalid
+pm25 %>%
+  filter(!invalid) %>%
+  plot()
+```
+
+Further analysis: comparison with LANUV data `TODO`
--- a/inst/doc/osem-intro.html
+++ b/inst/doc/osem-intro.html
--- a/inst/doc/osem-serialization.R
+++ b/inst/doc/osem-serialization.R
@ -0,0 +1,51 @@
+## ----setup, results='hide'----------------------------------------------------
+# this vignette requires:
+library(opensensmapr)
+library(jsonlite)
+library(readr)
+
+## ----cache--------------------------------------------------------------------
+b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
+
+# the next identical request will hit the cache only!
+b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
+
+# requests without the cache parameter will still be performed normally
+b = osem_boxes(grouptag = 'ifgi')
+
+## ----cachelisting-------------------------------------------------------------
+list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
+
+## ----cache_custom-------------------------------------------------------------
+cacheDir = getwd() # current working directory
+b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
+
+# the next identical request will hit the cache only!
+b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
+
+## ----clearcache, results='hide'-----------------------------------------------
+osem_clear_cache()        # clears default cache
+osem_clear_cache(getwd()) # clears a custom cache
+
+## ----data, results='hide'-----------------------------------------------------
+# first get our example data:
+measurements = osem_measurements('Windgeschwindigkeit')
+
+## ----serialize_json-----------------------------------------------------------
+# serializing the measurements to JSON, and loading them from file again.
+# serializeJSON() also stores classes and attributes, so the object read back
+# has the same class as the original:
+write(jsonlite::serializeJSON(measurements), 'measurements.json')
+measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
+class(measurements_from_file)
+
+## ----serialize_attrs----------------------------------------------------------
+# note the toJSON call instead of serializeJSON
+write(jsonlite::toJSON(measurements), 'measurements_bad.json')
+measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
+class(measurements_without_attrs)
+
+measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
+class(measurements_with_attrs)
+
+## ----cleanup, include=FALSE---------------------------------------------------
+file.remove('measurements.json', 'measurements_bad.json')
+
--- a/inst/doc/osem-serialization.Rmd
+++ b/inst/doc/osem-serialization.Rmd
@ -0,0 +1,106 @@
+---
+title: "Caching openSenseMap Data for Reproducibility"
+author: "Norwin Roosen"
+date: "`r Sys.Date()`"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+It may be useful to download data from openSenseMap only once.
+For reproducible results, the data should be saved to disk, and reloaded at a
+later point.
+
+This avoids:
+
+- changed results for queries without date parameters,
+- unnecessary wait times,
+- risk of API changes / API unavailability,
+- stress on the openSenseMap-server.
+
+This vignette shows how to use this built-in `opensensmapr` feature, and
+how to do it yourself in case you want to save to other data formats.
+
+```{r setup, results='hide'}
+# this vignette requires:
+library(opensensmapr)
+library(jsonlite)
+library(readr)
+```
+
+## Using the opensensmapr Caching Feature
+All data retrieval functions of `opensensmapr` have a built-in caching feature,
+which serializes an API response to disk.
+Subsequent identical requests will then return the serialized data instead of making
+another request.
+
+To use this feature, just add a path to a directory to the `cache` parameter:
+```{r cache}
+b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
+
+# the next identical request will hit the cache only!
+b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
+
+# requests without the cache parameter will still be performed normally
+b = osem_boxes(grouptag = 'ifgi')
+```
+
+Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL:
+```{r cachelisting}
+list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
+```
+
+You can maintain multiple caches simultaneously, which allows you to store only the data related to a script in the same directory:
+```{r cache_custom}
+cacheDir = getwd() # current working directory
+b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
+
+# the next identical request will hit the cache only!
+b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
+```
+
+To get fresh results again, just call `osem_clear_cache()` for the respective cache:
+```{r clearcache, results='hide'}
+osem_clear_cache()        # clears default cache
+osem_clear_cache(getwd()) # clears a custom cache
+```
+
+## Custom (De-) Serialization
+If you want to roll your own serialization method to support custom data formats,
+here's how:
+
+```{r data, results='hide'}
+# first get our example data:
+measurements = osem_measurements('Windgeschwindigkeit')
+```
+
+If you are paranoid and worry about `.rds` files not being decodable anymore
+in the (distant) future, you could serialize to a plain text format such as JSON.
+This of course comes at the cost of storage space and performance.
+```{r serialize_json}
+# serializing the measurements to JSON, and loading them from file again.
+# serializeJSON() also stores classes and attributes, so the object read back
+# has the same class as the original:
+write(jsonlite::serializeJSON(measurements), 'measurements.json')
+measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
+class(measurements_from_file)
+```
+
+This method also persists the R object metadata (classes, attributes).
+If you were to use a serialization method that can't persist object metadata, you
+could re-apply it with the following functions:
+
+```{r serialize_attrs}
+# note the toJSON call instead of serializeJSON
+write(jsonlite::toJSON(measurements), 'measurements_bad.json')
+measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
+class(measurements_without_attrs)
+
+measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
+class(measurements_with_attrs)
+```
+The same goes for boxes via `osem_as_sensebox()`.
+
+```{r cleanup, include=FALSE}
+file.remove('measurements.json', 'measurements_bad.json')
+```
--- a/inst/doc/osem-serialization.html
+++ b/inst/doc/osem-serialization.html
@ -0,0 +1,444 @@
+<!DOCTYPE html>
+
+<html>
+
+<head>
+
+<meta charset="utf-8" />
+<meta name="generator" content="pandoc" />
+<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+
+<meta name="author" content="Norwin Roosen" />
+
+<meta name="date" content="2023-02-23" />
+
+<title>Caching openSenseMap Data for Reproducibility</title>
+
+<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
+// be compatible with the behavior of Pandoc < 2.8).
+document.addEventListener('DOMContentLoaded', function(e) {
+  var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
+  var i, h, a;
+  for (i = 0; i < hs.length; i++) {
+    h = hs[i];
+    if (!/^h[1-6]$/i.test(h.tagName)) continue;  // it should be a header h1-h6
+    a = h.attributes;
+    while (a.length > 0) h.removeAttribute(a[0].name);
+  }
+});
+</script>
+
+<style type="text/css">
+code{white-space: pre-wrap;}
+span.smallcaps{font-variant: small-caps;}
+span.underline{text-decoration: underline;}
+div.column{display: inline-block; vertical-align: top; width: 50%;}
+div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
+ul.task-list{list-style: none;}
+</style>
+
+
+
+<style type="text/css">
+code {
+white-space: pre;
+}
+.sourceCode {
+overflow: visible;
+}
+</style>
+<style type="text/css" data-origin="pandoc">
+pre > code.sourceCode { white-space: pre; position: relative; }
+pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
+pre > code.sourceCode > span:empty { height: 1.2em; }
+.sourceCode { overflow: visible; }
+code.sourceCode > span { color: inherit; text-decoration: inherit; }
+div.sourceCode { margin: 1em 0; }
+pre.sourceCode { margin: 0; }
+@media screen {
+div.sourceCode { overflow: auto; }
+}
+@media print {
+pre > code.sourceCode { white-space: pre-wrap; }
+pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
+}
+pre.numberSource code
+{ counter-reset: source-line 0; }
+pre.numberSource code > span
+{ position: relative; left: -4em; counter-increment: source-line; }
+pre.numberSource code > span > a:first-child::before
+{ content: counter(source-line);
+position: relative; left: -1em; text-align: right; vertical-align: baseline;
+border: none; display: inline-block;
+-webkit-touch-callout: none; -webkit-user-select: none;
+-khtml-user-select: none; -moz-user-select: none;
+-ms-user-select: none; user-select: none;
+padding: 0 4px; width: 4em;
+color: #aaaaaa;
+}
+pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
+div.sourceCode
+{ }
+@media screen {
+pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
+}
+code span.al { color: #ff0000; font-weight: bold; } 
+code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } 
+code span.at { color: #7d9029; } 
+code span.bn { color: #40a070; } 
+code span.bu { color: #008000; } 
+code span.cf { color: #007020; font-weight: bold; } 
+code span.ch { color: #4070a0; } 
+code span.cn { color: #880000; } 
+code span.co { color: #60a0b0; font-style: italic; } 
+code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } 
+code span.do { color: #ba2121; font-style: italic; } 
+code span.dt { color: #902000; } 
+code span.dv { color: #40a070; } 
+code span.er { color: #ff0000; font-weight: bold; } 
+code span.ex { } 
+code span.fl { color: #40a070; } 
+code span.fu { color: #06287e; } 
+code span.im { color: #008000; font-weight: bold; } 
+code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } 
+code span.kw { color: #007020; font-weight: bold; } 
+code span.op { color: #666666; } 
+code span.ot { color: #007020; } 
+code span.pp { color: #bc7a00; } 
+code span.sc { color: #4070a0; } 
+code span.ss { color: #bb6688; } 
+code span.st { color: #4070a0; } 
+code span.va { color: #19177c; } 
+code span.vs { color: #4070a0; } 
+code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } 
+</style>
+<script>
+// apply pandoc div.sourceCode style to pre.sourceCode instead
+(function() {
+  var sheets = document.styleSheets;
+  for (var i = 0; i < sheets.length; i++) {
+    if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
+    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
+    var j = 0;
+    while (j < rules.length) {
+      var rule = rules[j];
+      // check if there is a div.sourceCode rule
+      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
+        j++;
+        continue;
+      }
+      var style = rule.style.cssText;
+      // check if color or background-color is set
+      if (rule.style.color === '' && rule.style.backgroundColor === '') {
+        j++;
+        continue;
+      }
+      // replace div.sourceCode by a pre.sourceCode rule
+      sheets[i].deleteRule(j);
+      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
+    }
+  }
+})();
+</script>
+
+
+
+
+<style type="text/css">body {
+background-color: #fff;
+margin: 1em auto;
+max-width: 700px;
+overflow: visible;
+padding-left: 2em;
+padding-right: 2em;
+font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+font-size: 14px;
+line-height: 1.35;
+}
+#TOC {
+clear: both;
+margin: 0 0 10px 10px;
+padding: 4px;
+width: 400px;
+border: 1px solid #CCCCCC;
+border-radius: 5px;
+background-color: #f6f6f6;
+font-size: 13px;
+line-height: 1.3;
+}
+#TOC .toctitle {
+font-weight: bold;
+font-size: 15px;
+margin-left: 5px;
+}
+#TOC ul {
+padding-left: 40px;
+margin-left: -1.5em;
+margin-top: 5px;
+margin-bottom: 5px;
+}
+#TOC ul ul {
+margin-left: -2em;
+}
+#TOC li {
+line-height: 16px;
+}
+table {
+margin: 1em auto;
+border-width: 1px;
+border-color: #DDDDDD;
+border-style: outset;
+border-collapse: collapse;
+}
+table th {
+border-width: 2px;
+padding: 5px;
+border-style: inset;
+}
+table td {
+border-width: 1px;
+border-style: inset;
+line-height: 18px;
+padding: 5px 5px;
+}
+table, table th, table td {
+border-left-style: none;
+border-right-style: none;
+}
+table thead, table tr.even {
+background-color: #f7f7f7;
+}
+p {
+margin: 0.5em 0;
+}
+blockquote {
+background-color: #f6f6f6;
+padding: 0.25em 0.75em;
+}
+hr {
+border-style: solid;
+border: none;
+border-top: 1px solid #777;
+margin: 28px 0;
+}
+dl {
+margin-left: 0;
+}
+dl dd {
+margin-bottom: 13px;
+margin-left: 13px;
+}
+dl dt {
+font-weight: bold;
+}
+ul {
+margin-top: 0;
+}
+ul li {
+list-style: circle outside;
+}
+ul ul {
+margin-bottom: 0;
+}
+pre, code {
+background-color: #f7f7f7;
+border-radius: 3px;
+color: #333;
+white-space: pre-wrap; 
+}
+pre {
+border-radius: 3px;
+margin: 5px 0px 10px 0px;
+padding: 10px;
+}
+pre:not([class]) {
+background-color: #f7f7f7;
+}
+code {
+font-family: Consolas, Monaco, 'Courier New', monospace;
+font-size: 85%;
+}
+p > code, li > code {
+padding: 2px 0px;
+}
+div.figure {
+text-align: center;
+}
+img {
+background-color: #FFFFFF;
+padding: 2px;
+border: 1px solid #DDDDDD;
+border-radius: 3px;
+border: 1px solid #CCCCCC;
+margin: 0 5px;
+}
+h1 {
+margin-top: 0;
+font-size: 35px;
+line-height: 40px;
+}
+h2 {
+border-bottom: 4px solid #f7f7f7;
+padding-top: 10px;
+padding-bottom: 2px;
+font-size: 145%;
+}
+h3 {
+border-bottom: 2px solid #f7f7f7;
+padding-top: 10px;
+font-size: 120%;
+}
+h4 {
+border-bottom: 1px solid #f7f7f7;
+margin-left: 8px;
+font-size: 105%;
+}
+h5, h6 {
+border-bottom: 1px solid #ccc;
+font-size: 105%;
+}
+a {
+color: #0033dd;
+text-decoration: none;
+}
+a:hover {
+color: #6666ff; }
+a:visited {
+color: #800080; }
+a:visited:hover {
+color: #BB00BB; }
+a[href^="http:"] {
+text-decoration: underline; }
+a[href^="https:"] {
+text-decoration: underline; }
+
+code > span.kw { color: #555; font-weight: bold; } 
+code > span.dt { color: #902000; } 
+code > span.dv { color: #40a070; } 
+code > span.bn { color: #d14; } 
+code > span.fl { color: #d14; } 
+code > span.ch { color: #d14; } 
+code > span.st { color: #d14; } 
+code > span.co { color: #888888; font-style: italic; } 
+code > span.ot { color: #007020; } 
+code > span.al { color: #ff0000; font-weight: bold; } 
+code > span.fu { color: #900; font-weight: bold; } 
+code > span.er { color: #a61717; background-color: #e3d2d2; } 
+</style>
+
+
+
+
+</head>
+
+<body>
+
+
+
+
+<h1 class="title toc-ignore">Caching openSenseMap Data for
+Reproducibility</h1>
+<h4 class="author">Norwin Roosen</h4>
+<h4 class="date">2023-02-23</h4>
+
+
+
+<p>It may be useful to download data from openSenseMap only once. For
+reproducible results, the data should be saved to disk, and reloaded at
+a later point.</p>
+<p>This avoids..</p>
+<ul>
+<li>changed results for queries without date parameters,</li>
+<li>unnecessary wait times,</li>
+<li>risk of API changes / API unavailability,</li>
+<li>stress on the openSenseMap-server.</li>
+</ul>
+<p>This vignette shows how to use this built in
+<code>opensensmapr</code> feature, and how to do it yourself in case you
+want to save to other data formats.</p>
+<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># this vignette requires:</span></span>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(opensensmapr)</span>
+<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(jsonlite)</span>
+<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(readr)</span></code></pre></div>
+<div id="using-the-opensensmapr-caching-feature" class="section level2">
+<h2>Using the opensensmapr Caching Feature</h2>
+<p>All data retrieval functions of <code>opensensmapr</code> have a
+built in caching feature, which serializes an API response to disk.
+Subsequent identical requests will then return the serialized data
+instead of making another request.</p>
+<p>To use this feature, just add a path to a directory to the
+<code>cache</code> parameter:</p>
+<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">&#39;ifgi&#39;</span>, <span class="at">cache =</span> <span class="fu">tempdir</span>())</span>
+<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co"># the next identical request will hit the cache only!</span></span>
+<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">&#39;ifgi&#39;</span>, <span class="at">cache =</span> <span class="fu">tempdir</span>())</span>
+<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="co"># requests without the cache parameter will still be performed normally</span></span>
+<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">&#39;ifgi&#39;</span>)</span></code></pre></div>
+<p>Looking at the cache directory we can see one file for each request,
+which is identified through a hash of the request URL:</p>
+<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">list.files</span>(<span class="fu">tempdir</span>(), <span class="at">pattern =</span> <span class="st">&#39;osemcache</span><span class="sc">\\</span><span class="st">..*</span><span class="sc">\\</span><span class="st">.rds&#39;</span>)</span></code></pre></div>
+<pre><code>## [1] &quot;osemcache.17db5c57fc6fca4d836fa2cf30345ce8767cd61a.rds&quot;</code></pre>
+<p>You can maintain multiple caches simultaneously which allows to only
+store data related to a script in the same directory:</p>
+<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>cacheDir <span class="ot">=</span> <span class="fu">getwd</span>() <span class="co"># current working directory</span></span>
+<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">&#39;ifgi&#39;</span>, <span class="at">cache =</span> cacheDir)</span>
+<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="co"># the next identical request will hit the cache only!</span></span>
+<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">&#39;ifgi&#39;</span>, <span class="at">cache =</span> cacheDir)</span></code></pre></div>
+<p>To get fresh results again, just call <code>osem_clear_cache()</code>
+for the respective cache:</p>
+<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">osem_clear_cache</span>()        <span class="co"># clears default cache</span></span>
+<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="fu">osem_clear_cache</span>(<span class="fu">getwd</span>()) <span class="co"># clears a custom cache</span></span></code></pre></div>
+</div>
+<div id="custom-de--serialization" class="section level2">
+<h2>Custom (De-) Serialization</h2>
+<p>If you want to roll your own serialization method to support custom
+data formats, here’s how:</p>
+<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="co"># first get our example data:</span></span>
+<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a>measurements <span class="ot">=</span> <span class="fu">osem_measurements</span>(<span class="st">&#39;Windgeschwindigkeit&#39;</span>)</span></code></pre></div>
+<p>If you are paranoid and worry about <code>.rds</code> files not being
+decodable anymore in the (distant) future, you could serialize to a
+plain text format such as JSON. This of course comes at the cost of
+storage space and performance.</p>
+<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="co"># serializing senseBoxes to JSON, and loading from file again:</span></span>
+<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="fu">write</span>(jsonlite<span class="sc">::</span><span class="fu">serializeJSON</span>(measurements), <span class="st">&#39;measurements.json&#39;</span>)</span>
+<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>measurements_from_file <span class="ot">=</span> jsonlite<span class="sc">::</span><span class="fu">unserializeJSON</span>(readr<span class="sc">::</span><span class="fu">read_file</span>(<span class="st">&#39;measurements.json&#39;</span>))</span>
+<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(measurements_from_file)</span></code></pre></div>
+<pre><code>## [1] &quot;osem_measurements&quot; &quot;tbl_df&quot;            &quot;tbl&quot;              
+## [4] &quot;data.frame&quot;</code></pre>
+<p>This method also persists the R object metadata (classes,
+attributes). If you were to use a serialization method that can’t
+persist object metadata, you could re-apply it with the following
+functions:</p>
+<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="co"># note the toJSON call instead of serializeJSON</span></span>
+<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="fu">write</span>(jsonlite<span class="sc">::</span><span class="fu">toJSON</span>(measurements), <span class="st">&#39;measurements_bad.json&#39;</span>)</span>
+<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a>measurements_without_attrs <span class="ot">=</span> jsonlite<span class="sc">::</span><span class="fu">fromJSON</span>(<span class="st">&#39;measurements_bad.json&#39;</span>)</span>
+<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(measurements_without_attrs)</span></code></pre></div>
+<pre><code>## [1] &quot;data.frame&quot;</code></pre>
+<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>measurements_with_attrs <span class="ot">=</span> <span class="fu">osem_as_measurements</span>(measurements_without_attrs)</span>
+<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(measurements_with_attrs)</span></code></pre></div>
+<pre><code>## [1] &quot;osem_measurements&quot; &quot;tbl_df&quot;            &quot;tbl&quot;              
+## [4] &quot;data.frame&quot;</code></pre>
+<p>The same goes for boxes via <code>osem_as_sensebox()</code>.</p>
+</div>
+
+
+
+<!-- code folding -->
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+  (function () {
+    var script = document.createElement("script");
+    script.type = "text/javascript";
+    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+    document.getElementsByTagName("head")[0].appendChild(script);
+  })();
+</script>
+
+</body>
+</html>