remove inst/doc files
parent
b26ca150a9
commit
62667ef139
@ -1,133 +0,0 @@
|
|||||||
## ----setup, results='hide', message=FALSE, warning=FALSE-----------------
|
|
||||||
# required packages:
|
|
||||||
library(opensensmapr) # data download
|
|
||||||
library(dplyr) # data wrangling
|
|
||||||
library(ggplot2) # plotting
|
|
||||||
library(lubridate) # date arithmetic
|
|
||||||
library(zoo) # rollmean()
|
|
||||||
|
|
||||||
## ----download------------------------------------------------------------
|
|
||||||
# if you want to see results for a specific subset of boxes,
|
|
||||||
# just specify a filter such as grouptag='ifgi' here
|
|
||||||
boxes = osem_boxes()
|
|
||||||
|
|
||||||
## ----exposure_counts, message=FALSE--------------------------------------
|
|
||||||
exposure_counts = boxes %>%
|
|
||||||
group_by(exposure) %>%
|
|
||||||
mutate(count = row_number(createdAt))
|
|
||||||
|
|
||||||
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
|
|
||||||
ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
|
|
||||||
geom_line() +
|
|
||||||
scale_colour_manual(values = exposure_colors) +
|
|
||||||
xlab('Registration Date') + ylab('senseBox count')
|
|
||||||
|
|
||||||
## ----exposure_summary----------------------------------------------------
|
|
||||||
exposure_counts %>%
|
|
||||||
summarise(
|
|
||||||
oldest = min(createdAt),
|
|
||||||
newest = max(createdAt),
|
|
||||||
count = max(count)
|
|
||||||
) %>%
|
|
||||||
arrange(desc(count))
|
|
||||||
|
|
||||||
## ----grouptag_counts, message=FALSE--------------------------------------
|
|
||||||
# Per-grouptag registration history: for every box in a sufficiently large
# grouptag, `count` is the rank of its createdAt within that grouptag.
grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members.
  # The elementwise `&` is required: `!is.na(grouptag)` yields one value per
  # row of the group, and the scalar `&&` raises an error for such input
  # since R 4.3 (earlier versions silently looked at the first row only).
  filter(length(grouptag) >= 8 & !is.na(grouptag)) %>%
  mutate(count = row_number(createdAt))
|
|
||||||
|
|
||||||
# helper for sorting the grouptags by boxcount
|
|
||||||
# Re-level a vector as a factor whose levels are ordered by frequency.
#   oldFactor: factor or character vector; its values are tabulated with table()
#   ascending: TRUE (default) puts the rarest value first,
#              FALSE puts the most frequent value first
# Returns factor(oldFactor) using the frequency-ordered levels; the element
# order of the input is unchanged.
sortLvls = function(oldFactor, ascending = TRUE) {
  # plain nested calls instead of a pipeline, so the helper only needs base R
  lvls = names(sort(table(oldFactor), decreasing = !ascending))
  factor(oldFactor, levels = lvls)
}
|
|
||||||
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
|
|
||||||
|
|
||||||
ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
|
|
||||||
geom_line(aes(group = grouptag)) +
|
|
||||||
xlab('Registration Date') + ylab('senseBox count')
|
|
||||||
|
|
||||||
## ----grouptag_summary----------------------------------------------------
|
|
||||||
grouptag_counts %>%
|
|
||||||
summarise(
|
|
||||||
oldest = min(createdAt),
|
|
||||||
newest = max(createdAt),
|
|
||||||
count = max(count)
|
|
||||||
) %>%
|
|
||||||
arrange(desc(count))
|
|
||||||
|
|
||||||
## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'----
|
|
||||||
bins = 'week'
|
|
||||||
mvavg_bins = 6
|
|
||||||
|
|
||||||
growth = boxes %>%
|
|
||||||
mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
|
|
||||||
group_by(week) %>%
|
|
||||||
summarize(count = length(week)) %>%
|
|
||||||
mutate(event = 'registered')
|
|
||||||
|
|
||||||
## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'----
|
|
||||||
inactive = boxes %>%
|
|
||||||
# remove boxes that were updated in the last two days,
|
|
||||||
# b/c any box becomes inactive at some point by definition of updatedAt
|
|
||||||
filter(updatedAt < now() - days(2)) %>%
|
|
||||||
mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
|
|
||||||
group_by(week) %>%
|
|
||||||
summarize(count = length(week)) %>%
|
|
||||||
mutate(event = 'inactive')
|
|
||||||
|
|
||||||
## ----growthrate, warning=FALSE, message=FALSE, results='hide'------------
|
|
||||||
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
|
|
||||||
|
|
||||||
ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
|
|
||||||
xlab('Time') + ylab(paste('rate per ', bins)) +
|
|
||||||
scale_x_date(date_breaks="years", date_labels="%Y") +
|
|
||||||
scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
|
|
||||||
geom_point(aes(y = count), size = 0.5) +
|
|
||||||
# moving average, make first and last value NA (to ensure identical length of vectors)
|
|
||||||
geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
|
|
||||||
|
|
||||||
## ----exposure_duration, message=FALSE------------------------------------
|
|
||||||
duration = boxes %>%
|
|
||||||
group_by(exposure) %>%
|
|
||||||
filter(!is.na(updatedAt)) %>%
|
|
||||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
|
||||||
|
|
||||||
ggplot(duration, aes(x = exposure, y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days')
|
|
||||||
|
|
||||||
## ----grouptag_duration, message=FALSE------------------------------------
|
|
||||||
# Time between registration (createdAt) and last update (updatedAt) of each
# box, in days, for boxes that belong to a sufficiently large grouptag.
duration = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members, and drop rows without a
  # grouptag or without an updatedAt value.
  # The elementwise `&` is required: the is.na() tests yield one value per
  # row of the group, and the scalar `&&` raises an error for such input
  # since R 4.3 (earlier versions silently looked at the first row only).
  filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
|
||||||
|
|
||||||
ggplot(duration, aes(x = grouptag, y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days')
|
|
||||||
|
|
||||||
duration %>%
|
|
||||||
summarize(
|
|
||||||
duration_avg = round(mean(duration)),
|
|
||||||
duration_min = round(min(duration)),
|
|
||||||
duration_max = round(max(duration)),
|
|
||||||
oldest_box = round(max(difftime(now(), createdAt, units='days')))
|
|
||||||
) %>%
|
|
||||||
arrange(desc(duration_avg))
|
|
||||||
|
|
||||||
## ----year_duration, message=FALSE----------------------------------------
|
|
||||||
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
|
|
||||||
duration = boxes %>%
|
|
||||||
mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
|
|
||||||
group_by(year) %>%
|
|
||||||
filter(!is.na(updatedAt)) %>%
|
|
||||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
|
||||||
|
|
||||||
ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
|
|
||||||
|
|
@ -1,243 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Visualising the History of openSenseMap.org"
|
|
||||||
author: "Norwin Roosen"
|
|
||||||
date: '`r Sys.Date()`'
|
|
||||||
output:
|
|
||||||
rmarkdown::html_vignette:
|
|
||||||
df_print: kable
|
|
||||||
fig_height: 5
|
|
||||||
fig_width: 7
|
|
||||||
toc: yes
|
|
||||||
html_document:
|
|
||||||
code_folding: hide
|
|
||||||
df_print: kable
|
|
||||||
theme: lumen
|
|
||||||
toc: yes
|
|
||||||
toc_float: yes
|
|
||||||
vignette: >
|
|
||||||
%\VignetteIndexEntry{Visualising the History of openSenseMap.org}
|
|
||||||
%\VignetteEngine{knitr::rmarkdown}
|
|
||||||
%\VignetteEncoding{UTF-8}
|
|
||||||
---
|
|
||||||
|
|
||||||
> This vignette serves as an example on data wrangling & visualization with
|
|
||||||
`opensensmapr`, `dplyr` and `ggplot2`.
|
|
||||||
|
|
||||||
```{r setup, results='hide', message=FALSE, warning=FALSE}
|
|
||||||
# required packages:
|
|
||||||
library(opensensmapr) # data download
|
|
||||||
library(dplyr) # data wrangling
|
|
||||||
library(ggplot2) # plotting
|
|
||||||
library(lubridate) # date arithmetic
|
|
||||||
library(zoo) # rollmean()
|
|
||||||
```
|
|
||||||
|
|
||||||
openSenseMap.org has grown quite a bit in the last years; it would be interesting
|
|
||||||
to see how we got to the current `r osem_counts()$boxes` sensor stations,
|
|
||||||
split up by various attributes of the boxes.
|
|
||||||
|
|
||||||
While `opensensmapr` provides extensive methods of filtering boxes by attributes
|
|
||||||
on the server, we do the filtering within R to save time and gain flexibility.
|
|
||||||
So the first step is to retrieve *all the boxes*:
|
|
||||||
|
|
||||||
```{r download}
|
|
||||||
# if you want to see results for a specific subset of boxes,
|
|
||||||
# just specify a filter such as grouptag='ifgi' here
|
|
||||||
boxes = osem_boxes()
|
|
||||||
```
|
|
||||||
|
|
||||||
# Plot count of boxes by time {.tabset}
|
|
||||||
By looking at the `createdAt` attribute of each box we know the exact time a box
|
|
||||||
was registered.
|
|
||||||
With this approach we have no information about boxes that were deleted in the
|
|
||||||
meantime, but that's okay for now.
|
|
||||||
|
|
||||||
## ...and exposure
|
|
||||||
```{r exposure_counts, message=FALSE}
|
|
||||||
exposure_counts = boxes %>%
|
|
||||||
group_by(exposure) %>%
|
|
||||||
mutate(count = row_number(createdAt))
|
|
||||||
|
|
||||||
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
|
|
||||||
ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
|
|
||||||
geom_line() +
|
|
||||||
scale_colour_manual(values = exposure_colors) +
|
|
||||||
xlab('Registration Date') + ylab('senseBox count')
|
|
||||||
```
|
|
||||||
|
|
||||||
Outdoor boxes are growing *fast*!
|
|
||||||
We can also see the introduction of `mobile` sensor "stations" in 2017. While
|
|
||||||
mobile boxes are still few, we can expect a quick rise in 2018 once the new
|
|
||||||
[senseBox MCU with GPS support is released](https://sensebox.de/blog/2018-03-06-senseBox_MCU).
|
|
||||||
|
|
||||||
Let's have a quick summary:
|
|
||||||
```{r exposure_summary}
|
|
||||||
exposure_counts %>%
|
|
||||||
summarise(
|
|
||||||
oldest = min(createdAt),
|
|
||||||
newest = max(createdAt),
|
|
||||||
count = max(count)
|
|
||||||
) %>%
|
|
||||||
arrange(desc(count))
|
|
||||||
```
|
|
||||||
|
|
||||||
## ...and grouptag
|
|
||||||
We can try to find out where the increases in growth came from, by analysing the
|
|
||||||
box count by grouptag.
|
|
||||||
|
|
||||||
Caveats: Only a small subset of boxes has a grouptag, and we should assume
|
|
||||||
that these groups are actually bigger. Also, we can see that grouptag naming is
|
|
||||||
inconsistent (`Luftdaten`, `luftdaten.info`, ...)
|
|
||||||
|
|
||||||
```{r grouptag_counts, message=FALSE}
|
|
||||||
# Per-grouptag registration history: for every box in a sufficiently large
# grouptag, `count` is the rank of its createdAt within that grouptag.
grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members.
  # The elementwise `&` is required: `!is.na(grouptag)` yields one value per
  # row of the group, and the scalar `&&` raises an error for such input
  # since R 4.3 (earlier versions silently looked at the first row only).
  filter(length(grouptag) >= 8 & !is.na(grouptag)) %>%
  mutate(count = row_number(createdAt))
|
|
||||||
|
|
||||||
# helper for sorting the grouptags by boxcount
|
|
||||||
# Re-level a vector as a factor whose levels are ordered by frequency.
#   oldFactor: factor or character vector; its values are tabulated with table()
#   ascending: TRUE (default) puts the rarest value first,
#              FALSE puts the most frequent value first
# Returns factor(oldFactor) using the frequency-ordered levels; the element
# order of the input is unchanged.
sortLvls = function(oldFactor, ascending = TRUE) {
  # plain nested calls instead of a pipeline, so the helper only needs base R
  lvls = names(sort(table(oldFactor), decreasing = !ascending))
  factor(oldFactor, levels = lvls)
}
|
|
||||||
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
|
|
||||||
|
|
||||||
ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
|
|
||||||
geom_line(aes(group = grouptag)) +
|
|
||||||
xlab('Registration Date') + ylab('senseBox count')
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r grouptag_summary}
|
|
||||||
grouptag_counts %>%
|
|
||||||
summarise(
|
|
||||||
oldest = min(createdAt),
|
|
||||||
newest = max(createdAt),
|
|
||||||
count = max(count)
|
|
||||||
) %>%
|
|
||||||
arrange(desc(count))
|
|
||||||
```
|
|
||||||
|
|
||||||
# Plot rate of growth and inactivity per week
|
|
||||||
First we group the boxes by `createdAt` into bins of one week:
|
|
||||||
```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
|
|
||||||
bins = 'week'
|
|
||||||
mvavg_bins = 6
|
|
||||||
|
|
||||||
growth = boxes %>%
|
|
||||||
mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
|
|
||||||
group_by(week) %>%
|
|
||||||
summarize(count = length(week)) %>%
|
|
||||||
mutate(event = 'registered')
|
|
||||||
```
|
|
||||||
|
|
||||||
We can do the same for `updatedAt`, which informs us about the last change to
|
|
||||||
a box, including uploaded measurements.
|
|
||||||
This method of determining inactive boxes is fairly inaccurate and should be
|
|
||||||
considered an approximation, because we have no information about intermediate
|
|
||||||
inactive phases.
|
|
||||||
Also deleted boxes would probably have a big impact here.
|
|
||||||
```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
|
|
||||||
inactive = boxes %>%
|
|
||||||
# remove boxes that were updated in the last two days,
|
|
||||||
# b/c any box becomes inactive at some point by definition of updatedAt
|
|
||||||
filter(updatedAt < now() - days(2)) %>%
|
|
||||||
mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
|
|
||||||
group_by(week) %>%
|
|
||||||
summarize(count = length(week)) %>%
|
|
||||||
mutate(event = 'inactive')
|
|
||||||
```
|
|
||||||
|
|
||||||
Now we can combine both datasets for plotting:
|
|
||||||
```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
|
|
||||||
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
|
|
||||||
|
|
||||||
ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
|
|
||||||
xlab('Time') + ylab(paste('rate per ', bins)) +
|
|
||||||
scale_x_date(date_breaks="years", date_labels="%Y") +
|
|
||||||
scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
|
|
||||||
geom_point(aes(y = count), size = 0.5) +
|
|
||||||
# moving average, make first and last value NA (to ensure identical length of vectors)
|
|
||||||
geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
|
|
||||||
```
|
|
||||||
|
|
||||||
We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`.
|
|
||||||
This was enabled by an integration of openSenseMap.org into the firmware of the
|
|
||||||
air quality monitoring project [luftdaten.info](https://luftdaten.info).
|
|
||||||
The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues
|
|
||||||
of the senseBox hardware, but I have no data on the exact time frames to verify.
|
|
||||||
|
|
||||||
# Plot duration of boxes being active {.tabset}
|
|
||||||
While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity
|
|
||||||
of each box, and look at metrics by exposure and grouptag once more:
|
|
||||||
|
|
||||||
## ...by exposure
|
|
||||||
```{r exposure_duration, message=FALSE}
|
|
||||||
duration = boxes %>%
|
|
||||||
group_by(exposure) %>%
|
|
||||||
filter(!is.na(updatedAt)) %>%
|
|
||||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
|
||||||
|
|
||||||
ggplot(duration, aes(x = exposure, y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days')
|
|
||||||
```
|
|
||||||
|
|
||||||
The time of activity averages at only `r round(mean(duration$duration))` days,
|
|
||||||
though there are boxes with `r round(max(duration$duration))` days of activity,
|
|
||||||
spanning a large chunk of openSenseMap's existence.
|
|
||||||
|
|
||||||
## ...by grouptag
|
|
||||||
```{r grouptag_duration, message=FALSE}
|
|
||||||
# Time between registration (createdAt) and last update (updatedAt) of each
# box, in days, for boxes that belong to a sufficiently large grouptag.
duration = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 8 or more members, and drop rows without a
  # grouptag or without an updatedAt value.
  # The elementwise `&` is required: the is.na() tests yield one value per
  # row of the group, and the scalar `&&` raises an error for such input
  # since R 4.3 (earlier versions silently looked at the first row only).
  filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
  mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
|
||||||
|
|
||||||
ggplot(duration, aes(x = grouptag, y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days')
|
|
||||||
|
|
||||||
duration %>%
|
|
||||||
summarize(
|
|
||||||
duration_avg = round(mean(duration)),
|
|
||||||
duration_min = round(min(duration)),
|
|
||||||
duration_max = round(max(duration)),
|
|
||||||
oldest_box = round(max(difftime(now(), createdAt, units='days')))
|
|
||||||
) %>%
|
|
||||||
arrange(desc(duration_avg))
|
|
||||||
```
|
|
||||||
|
|
||||||
The time of activity averages at only `r round(mean(duration$duration))` days,
|
|
||||||
though there are boxes with `r round(max(duration$duration))` days of activity,
|
|
||||||
spanning a large chunk of openSenseMap's existence.
|
|
||||||
|
|
||||||
## ...by year of registration
|
|
||||||
This is less useful, as older boxes are active for a longer time by definition.
|
|
||||||
If you have an idea how to compensate for that, please send a [Pull Request][PR]!
|
|
||||||
|
|
||||||
```{r year_duration, message=FALSE}
|
|
||||||
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
|
|
||||||
duration = boxes %>%
|
|
||||||
mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
|
|
||||||
group_by(year) %>%
|
|
||||||
filter(!is.na(updatedAt)) %>%
|
|
||||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
|
||||||
|
|
||||||
ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
|
|
||||||
```
|
|
||||||
|
|
||||||
# More Visualisations
|
|
||||||
Other visualisations come to mind, and are left as an exercise to the reader.
|
|
||||||
If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
|
|
||||||
|
|
||||||
* growth by phenomenon
|
|
||||||
* growth by location -> (interactive) map
|
|
||||||
* set inactive rate in relation to total box count
|
|
||||||
* filter timespans with big dips in growth rate, and extrapolate the amount of
|
|
||||||
senseBoxes that could be on the platform today, assuming there were no production issues ;)
|
|
||||||
|
|
||||||
[PR]: https://github.com/sensebox/opensensmapr/pulls
|
|
File diff suppressed because one or more lines are too long
@ -1,302 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Visualising the Development of openSenseMap.org in 2022"
|
|
||||||
author: "Jan Stenkamp"
|
|
||||||
date: '`r Sys.Date()`'
|
|
||||||
output:
|
|
||||||
html_document:
|
|
||||||
code_folding: hide
|
|
||||||
df_print: kable
|
|
||||||
theme: lumen
|
|
||||||
toc: yes
|
|
||||||
toc_float: yes
|
|
||||||
rmarkdown::html_vignette:
|
|
||||||
df_print: kable
|
|
||||||
fig_height: 5
|
|
||||||
fig_width: 7
|
|
||||||
toc: yes
|
|
||||||
vignette: >
|
|
||||||
%\VignetteIndexEntry{Visualising the History of openSenseMap.org}
|
|
||||||
%\VignetteEncoding{UTF-8}
|
|
||||||
%\VignetteEngine{knitr::rmarkdown}
|
|
||||||
---
|
|
||||||
|
|
||||||
> This vignette serves as an example on data wrangling & visualization with
|
|
||||||
`opensensmapr`, `dplyr` and `ggplot2`.
|
|
||||||
|
|
||||||
```{r setup, results='hide', message=FALSE, warning=FALSE}
|
|
||||||
# required packages:
|
|
||||||
# library(opensensmapr) # data download
|
|
||||||
library(devtools)
|
|
||||||
load_all(".")
|
|
||||||
library(dplyr) # data wrangling
|
|
||||||
library(ggplot2) # plotting
|
|
||||||
library(lubridate) # date arithmetic
|
|
||||||
library(zoo) # rollmean()
|
|
||||||
```
|
|
||||||
|
|
||||||
openSenseMap.org has grown quite a bit in the last years; it would be interesting
|
|
||||||
to see how we got to the current `r osem_counts()$boxes` sensor stations,
|
|
||||||
split up by various attributes of the boxes.
|
|
||||||
|
|
||||||
While `opensensmapr` provides extensive methods of filtering boxes by attributes
|
|
||||||
on the server, we do the filtering within R to save time and gain flexibility.
|
|
||||||
|
|
||||||
|
|
||||||
So the first step is to retrieve *all the boxes*.
|
|
||||||
|
|
||||||
```{r download, results='hide', message=FALSE, warning=FALSE}
|
|
||||||
# if you want to see results for a specific subset of boxes,
|
|
||||||
# just specify a filter such as grouptag='ifgi' here
|
|
||||||
boxes_all = osem_boxes()
|
|
||||||
boxes = boxes_all
|
|
||||||
```
|
|
||||||
# Introduction
|
|
||||||
In the following we just want to have a look at the boxes created in 2022, so we filter for them.
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# keep only boxes whose location timestamp falls in calendar year 2022.
# The upper bound is exclusive: "2022-12-31" stands for the very start of
# that day (whether compared as a date-time or as an ISO string), so
# `<= "2022-12-31"` would drop boxes located later on 31 December.
boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp < "2023-01-01")
|
|
||||||
summary(boxes) -> summary.data.frame
|
|
||||||
```
|
|
||||||
|
|
||||||
<!-- This gives a good overview already: As of writing this, there are more than 11,000 -->
|
|
||||||
<!-- sensor stations, of which ~30% are currently running. Most of them are placed -->
|
|
||||||
<!-- outdoors and have around 5 sensors each. -->
|
|
||||||
<!-- The oldest station is from August 2016, while the latest station was registered a -->
|
|
||||||
<!-- couple of minutes ago. -->
|
|
||||||
|
|
||||||
Another feature of interest is the spatial distribution of the boxes: `plot()`
|
|
||||||
can help us out here. This function requires a bunch of optional dependencies though.
|
|
||||||
|
|
||||||
```{r message=F, warning=F}
|
|
||||||
if (!require('maps')) install.packages('maps')
|
|
||||||
if (!require('maptools')) install.packages('maptools')
|
|
||||||
if (!require('rgeos')) install.packages('rgeos')
|
|
||||||
|
|
||||||
plot(boxes)
|
|
||||||
```
|
|
||||||
|
|
||||||
But what do these sensor stations actually measure? Let's find out.
|
|
||||||
`osem_phenomena()` gives us a named list of the counts of each observed
|
|
||||||
phenomenon for the given set of sensor stations:
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
phenoms = osem_phenomena(boxes)
|
|
||||||
str(phenoms)
|
|
||||||
```
|
|
||||||
|
|
||||||
That's quite some noise there, with many phenomena being measured by a single
|
|
||||||
sensor only, or many duplicated phenomena due to slightly different spellings.
|
|
||||||
We should clean that up, but for now let's just filter out the noise and find
|
|
||||||
those phenomena with high sensor numbers:
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
phenoms[phenoms > 50]
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
# Plot count of boxes by time {.tabset}
|
|
||||||
By looking at the `createdAt` attribute of each box we know the exact time a box
|
|
||||||
was registered. Because of some database migration issues the `createdAt` values are mostly wrong (~80% of boxes created 2022-03-30), so we are using the `timestamp` attribute of the `currentlocation` which should in most cases correspond to the creation date.
|
|
||||||
|
|
||||||
With this approach we have no information about boxes that were deleted in the
|
|
||||||
meantime, but that's okay for now.
|
|
||||||
|
|
||||||
## ...and exposure
|
|
||||||
```{r exposure_counts, message=FALSE}
|
|
||||||
exposure_counts = boxes %>%
|
|
||||||
group_by(exposure) %>%
|
|
||||||
mutate(count = row_number(locationtimestamp))
|
|
||||||
|
|
||||||
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
|
|
||||||
ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
|
|
||||||
geom_line() +
|
|
||||||
scale_colour_manual(values = exposure_colors) +
|
|
||||||
xlab('Registration Date') + ylab('senseBox count')
|
|
||||||
```
|
|
||||||
|
|
||||||
Outdoor boxes are growing *fast*!
|
|
||||||
We can also see the introduction of `mobile` sensor "stations" in 2017.
|
|
||||||
|
|
||||||
Let's have a quick summary:
|
|
||||||
```{r exposure_summary}
|
|
||||||
exposure_counts %>%
|
|
||||||
summarise(
|
|
||||||
oldest = min(locationtimestamp),
|
|
||||||
newest = max(locationtimestamp),
|
|
||||||
count = max(count)
|
|
||||||
) %>%
|
|
||||||
arrange(desc(count))
|
|
||||||
```
|
|
||||||
|
|
||||||
## ...and grouptag
|
|
||||||
We can try to find out where the increases in growth came from, by analysing the
|
|
||||||
box count by grouptag.
|
|
||||||
|
|
||||||
Caveats: Only a small subset of boxes has a grouptag, and we should assume
|
|
||||||
that these groups are actually bigger. Also, we can see that grouptag naming is
|
|
||||||
inconsistent (`Luftdaten`, `luftdaten.info`, ...)
|
|
||||||
|
|
||||||
```{r grouptag_counts, message=FALSE}
|
|
||||||
# Per-grouptag registration history: for every box in a sufficiently large
# grouptag, `count` is the rank of its locationtimestamp within that grouptag.
grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 15 or more members, skipping missing and
  # empty grouptags.
  # The elementwise `&` is required: the per-row tests yield one value per
  # row of the group, and the scalar `&&` raises an error for such input
  # since R 4.3 (earlier versions silently looked at the first row only).
  filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>%
  mutate(count = row_number(locationtimestamp))
|
|
||||||
|
|
||||||
# helper for sorting the grouptags by boxcount
|
|
||||||
# Re-level a vector as a factor whose levels are ordered by frequency.
#   oldFactor: factor or character vector; its values are tabulated with table()
#   ascending: TRUE (default) puts the rarest value first,
#              FALSE puts the most frequent value first
# Returns factor(oldFactor) using the frequency-ordered levels; the element
# order of the input is unchanged.
sortLvls = function(oldFactor, ascending = TRUE) {
  # plain nested calls instead of a pipeline, so the helper only needs base R
  lvls = names(sort(table(oldFactor), decreasing = !ascending))
  factor(oldFactor, levels = lvls)
}
|
|
||||||
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
|
|
||||||
|
|
||||||
ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
|
|
||||||
geom_line(aes(group = grouptag)) +
|
|
||||||
xlab('Registration Date') + ylab('senseBox count')
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r grouptag_summary}
|
|
||||||
grouptag_counts %>%
|
|
||||||
summarise(
|
|
||||||
oldest = min(locationtimestamp),
|
|
||||||
newest = max(locationtimestamp),
|
|
||||||
count = max(count)
|
|
||||||
) %>%
|
|
||||||
arrange(desc(count))
|
|
||||||
```
|
|
||||||
|
|
||||||
# Plot rate of growth and inactivity per week
|
|
||||||
First we group the boxes by `locationtimestamp` into bins of one week:
|
|
||||||
```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
|
|
||||||
bins = 'week'
|
|
||||||
mvavg_bins = 6
|
|
||||||
|
|
||||||
growth = boxes %>%
|
|
||||||
mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
|
|
||||||
group_by(week) %>%
|
|
||||||
summarize(count = length(week)) %>%
|
|
||||||
mutate(event = 'registered')
|
|
||||||
```
|
|
||||||
|
|
||||||
We can do the same for `updatedAt`, which informs us about the last change to
|
|
||||||
a box, including uploaded measurements. As a lot of boxes were "updated" by the database
|
|
||||||
migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement`
|
|
||||||
attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes
|
|
||||||
boxes which were created but never made a measurement.
|
|
||||||
|
|
||||||
This method of determining inactive boxes is fairly inaccurate and should be
|
|
||||||
considered an approximation, because we have no information about intermediate
|
|
||||||
inactive phases.
|
|
||||||
Also deleted boxes would probably have a big impact here.
|
|
||||||
```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
|
|
||||||
inactive = boxes %>%
|
|
||||||
# remove boxes that were updated in the last two days,
|
|
||||||
# b/c any box becomes inactive at some point by definition of updatedAt
|
|
||||||
filter(lastMeasurement < now() - days(2)) %>%
|
|
||||||
mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
|
|
||||||
filter(as.Date(week) > as.Date("2021-12-31")) %>%
|
|
||||||
group_by(week) %>%
|
|
||||||
summarize(count = length(week)) %>%
|
|
||||||
mutate(event = 'inactive')
|
|
||||||
```
|
|
||||||
|
|
||||||
Now we can combine both datasets for plotting:
|
|
||||||
```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
|
|
||||||
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
|
|
||||||
|
|
||||||
ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
|
|
||||||
xlab('Time') + ylab(paste('rate per ', bins)) +
|
|
||||||
scale_x_date(date_breaks="years", date_labels="%Y") +
|
|
||||||
scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
|
|
||||||
geom_point(aes(y = count), size = 0.5) +
|
|
||||||
# moving average, make first and last value NA (to ensure identical length of vectors)
|
|
||||||
geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
|
|
||||||
```
|
|
||||||
|
|
||||||
And see in which weeks the most boxes become (in)active:
|
|
||||||
```{r table_mostregistrations}
|
|
||||||
boxes_by_date %>%
|
|
||||||
filter(count > 50) %>%
|
|
||||||
arrange(desc(count))
|
|
||||||
```
|
|
||||||
|
|
||||||
# Plot duration of boxes being active {.tabset}
|
|
||||||
While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity
|
|
||||||
of each box, and look at metrics by exposure and grouptag once more:
|
|
||||||
|
|
||||||
## ...by exposure
|
|
||||||
```{r exposure_duration, message=FALSE}
|
|
||||||
durations = boxes %>%
|
|
||||||
group_by(exposure) %>%
|
|
||||||
filter(!is.na(lastMeasurement)) %>%
|
|
||||||
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
|
|
||||||
filter(duration >= 0)
|
|
||||||
|
|
||||||
ggplot(durations, aes(x = exposure, y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days')
|
|
||||||
```
|
|
||||||
|
|
||||||
The time of activity averages at only `r round(mean(durations$duration))` days,
|
|
||||||
though there are boxes with `r round(max(durations$duration))` days of activity,
|
|
||||||
spanning a large chunk of openSenseMap's existence.
|
|
||||||
|
|
||||||
## ...by grouptag
|
|
||||||
```{r grouptag_duration, message=FALSE}
|
|
||||||
durations = boxes %>%
|
|
||||||
filter(!is.na(lastMeasurement)) %>%
|
|
||||||
group_by(grouptag) %>%
|
|
||||||
# only include grouptags with 15 or more members
|
|
||||||
filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>%
|
|
||||||
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
|
|
||||||
filter(duration >= 0)
|
|
||||||
|
|
||||||
ggplot(durations, aes(x = grouptag, y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days')
|
|
||||||
|
|
||||||
durations %>%
|
|
||||||
summarize(
|
|
||||||
duration_avg = round(mean(duration)),
|
|
||||||
duration_min = round(min(duration)),
|
|
||||||
duration_max = round(max(duration)),
|
|
||||||
oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
|
|
||||||
) %>%
|
|
||||||
arrange(desc(duration_avg))
|
|
||||||
```
|
|
||||||
|
|
||||||
The time of activity averages at only `r round(mean(durations$duration))` days,
|
|
||||||
though there are boxes with `r round(max(durations$duration))` days of activity,
|
|
||||||
spanning a large chunk of openSenseMap's existence.
|
|
||||||
|
|
||||||
## ...by year of registration
|
|
||||||
This is less useful, as older boxes are active for a longer time by definition.
|
|
||||||
If you have an idea how to compensate for that, please send a [Pull Request][PR]!
|
|
||||||
|
|
||||||
```{r year_duration, message=FALSE}
|
|
||||||
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
|
|
||||||
duration = boxes %>%
|
|
||||||
mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
|
|
||||||
group_by(year) %>%
|
|
||||||
filter(!is.na(lastMeasurement)) %>%
|
|
||||||
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
|
|
||||||
filter(duration >= 0)
|
|
||||||
|
|
||||||
ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
|
|
||||||
geom_boxplot() +
|
|
||||||
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
|
|
||||||
```
|
|
||||||
|
|
||||||
# More Visualisations
|
|
||||||
Other visualisations come to mind, and are left as an exercise to the reader.
|
|
||||||
If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
|
|
||||||
|
|
||||||
* growth by phenomenon
|
|
||||||
* growth by location -> (interactive) map
|
|
||||||
* set inactive rate in relation to total box count
|
|
||||||
* filter timespans with big dips in growth rate, and extrapolate the amount of
|
|
||||||
senseBoxes that could be on the platform today, assuming there were no production issues ;)
|
|
||||||
|
|
||||||
[PR]: https://github.com/sensebox/opensensmapr/pulls
|
|
||||||
|
|
||||||
|
|
File diff suppressed because one or more lines are too long
@ -1,73 +0,0 @@
|
|||||||
## ----setup, include=FALSE------------------------------------------------
|
|
||||||
knitr::opts_chunk$set(echo = TRUE)
|
|
||||||
|
|
||||||
## ----results = F---------------------------------------------------------
|
|
||||||
library(magrittr)
|
|
||||||
library(opensensmapr)
|
|
||||||
|
|
||||||
all_sensors = osem_boxes()
|
|
||||||
|
|
||||||
## ------------------------------------------------------------------------
|
|
||||||
summary(all_sensors)
|
|
||||||
|
|
||||||
## ----message=F, warning=F------------------------------------------------
|
|
||||||
if (!require('maps')) install.packages('maps')
|
|
||||||
if (!require('maptools')) install.packages('maptools')
|
|
||||||
if (!require('rgeos')) install.packages('rgeos')
|
|
||||||
|
|
||||||
plot(all_sensors)
|
|
||||||
|
|
||||||
## ------------------------------------------------------------------------
|
|
||||||
phenoms = osem_phenomena(all_sensors)
|
|
||||||
str(phenoms)
|
|
||||||
|
|
||||||
## ------------------------------------------------------------------------
|
|
||||||
phenoms[phenoms > 20]
|
|
||||||
|
|
||||||
## ----results = F---------------------------------------------------------
|
|
||||||
pm25_sensors = osem_boxes(
|
|
||||||
exposure = 'outdoor',
|
|
||||||
date = Sys.time(), # ±4 hours
|
|
||||||
phenomenon = 'PM2.5'
|
|
||||||
)
|
|
||||||
|
|
||||||
## ------------------------------------------------------------------------
|
|
||||||
summary(pm25_sensors)
|
|
||||||
plot(pm25_sensors)
|
|
||||||
|
|
||||||
## ------------------------------------------------------------------------
|
|
||||||
library(sf)
|
|
||||||
library(units)
|
|
||||||
library(lubridate)
|
|
||||||
library(dplyr)
|
|
||||||
|
|
||||||
# construct a bounding box: 12 kilometers around Berlin
|
|
||||||
berlin = st_point(c(13.4034, 52.5120)) %>%
|
|
||||||
st_sfc(crs = 4326) %>%
|
|
||||||
st_transform(3857) %>% # allow setting a buffer in meters
|
|
||||||
st_buffer(set_units(12, km)) %>%
|
|
||||||
st_transform(4326) %>% # the opensensemap expects WGS 84
|
|
||||||
st_bbox()
|
|
||||||
|
|
||||||
## ----results = F---------------------------------------------------------
|
|
||||||
pm25 = osem_measurements(
|
|
||||||
berlin,
|
|
||||||
phenomenon = 'PM2.5',
|
|
||||||
from = now() - days(20), # defaults to 2 days
|
|
||||||
to = now()
|
|
||||||
)
|
|
||||||
|
|
||||||
plot(pm25)
|
|
||||||
|
|
||||||
## ------------------------------------------------------------------------
|
|
||||||
outliers = filter(pm25, value > 100)$sensorId
|
|
||||||
# keep only the levels still present after filtering, i.e. the sensors that
# reported at least one value > 100 (assumes sensorId is a factor)
bad_sensors = outliers[, drop = TRUE] %>% levels()
|
|
||||||
|
|
||||||
pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
|
|
||||||
|
|
||||||
## ------------------------------------------------------------------------
|
|
||||||
# plot the measurement locations, coloured by the `invalid` flag
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = TRUE)
|
|
||||||
|
|
||||||
## ------------------------------------------------------------------------
|
|
||||||
pm25 %>% filter(invalid == FALSE) %>% plot()
|
|
||||||
|
|
@ -1,151 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Exploring the openSenseMap Dataset"
|
|
||||||
author: "Norwin Roosen"
|
|
||||||
date: "`r Sys.Date()`"
|
|
||||||
output:
|
|
||||||
rmarkdown::html_vignette:
|
|
||||||
fig_margin: 0
|
|
||||||
fig_width: 6
|
|
||||||
fig_height: 4
|
|
||||||
vignette: >
|
|
||||||
%\VignetteIndexEntry{Exploring the openSenseMap Dataset}
|
|
||||||
%\VignetteEngine{knitr::rmarkdown}
|
|
||||||
%\VignetteEncoding{UTF-8}
|
|
||||||
---
|
|
||||||
|
|
||||||
```{r setup, include=FALSE}
|
|
||||||
knitr::opts_chunk$set(echo = TRUE)
|
|
||||||
```
|
|
||||||
|
|
||||||
This package provides data ingestion functions for almost any data stored on the
|
|
||||||
open data platform for environmental sensor data <https://opensensemap.org>.
|
|
||||||
Its main goals are to provide means for:
|
|
||||||
|
|
||||||
- big data analysis of the measurements stored on the platform
|
|
||||||
- sensor metadata analysis (sensor counts, spatial distribution, temporal trends)
|
|
||||||
|
|
||||||
### Exploring the dataset
|
|
||||||
Before we look at actual observations, let's get a grasp of the openSenseMap
|
|
||||||
datasets' structure.
|
|
||||||
|
|
||||||
```{r results = F}
|
|
||||||
library(magrittr)
|
|
||||||
library(opensensmapr)
|
|
||||||
|
|
||||||
all_sensors = osem_boxes()
|
|
||||||
```
|
|
||||||
```{r}
|
|
||||||
summary(all_sensors)
|
|
||||||
```
|
|
||||||
|
|
||||||
This gives a good overview already: As of writing this, there are more than 700
|
|
||||||
sensor stations, of which ~50% are currently running. Most of them are placed
|
|
||||||
outdoors and have around 5 sensors each.
|
|
||||||
The oldest station is from May 2014, while the latest station was registered a
|
|
||||||
couple of minutes ago.
|
|
||||||
|
|
||||||
Another feature of interest is the spatial distribution of the boxes: `plot()`
|
|
||||||
can help us out here. This function requires a bunch of optional dependencies though.
|
|
||||||
|
|
||||||
```{r message=F, warning=F}
|
|
||||||
if (!require('maps')) install.packages('maps')
|
|
||||||
if (!require('maptools')) install.packages('maptools')
|
|
||||||
if (!require('rgeos')) install.packages('rgeos')
|
|
||||||
|
|
||||||
plot(all_sensors)
|
|
||||||
```
|
|
||||||
|
|
||||||
It seems we have to reduce our area of interest to Germany.
|
|
||||||
|
|
||||||
But what do these sensor stations actually measure? Let's find out.
|
|
||||||
`osem_phenomena()` gives us a named list of the counts of each observed
|
|
||||||
phenomenon for the given set of sensor stations:
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
phenoms = osem_phenomena(all_sensors)
|
|
||||||
str(phenoms)
|
|
||||||
```
|
|
||||||
|
|
||||||
That's quite some noise there, with many phenomena being measured by a single
|
|
||||||
sensor only, or many duplicated phenomena due to slightly different spellings.
|
|
||||||
We should clean that up, but for now let's just filter out the noise and find
|
|
||||||
those phenomena with high sensor numbers:
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
phenoms[phenoms > 20]
|
|
||||||
```
|
|
||||||
|
|
||||||
Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more
|
|
||||||
interesting to analyze though.
|
|
||||||
We should check how many sensor stations provide useful data: We want only those
|
|
||||||
boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting
|
|
||||||
measurements:
|
|
||||||
|
|
||||||
```{r results = F}
|
|
||||||
pm25_sensors = osem_boxes(
|
|
||||||
exposure = 'outdoor',
|
|
||||||
date = Sys.time(), # ±4 hours
|
|
||||||
phenomenon = 'PM2.5'
|
|
||||||
)
|
|
||||||
```
|
|
||||||
```{r}
|
|
||||||
summary(pm25_sensors)
|
|
||||||
plot(pm25_sensors)
|
|
||||||
```
|
|
||||||
|
|
||||||
That's still more than 200 measuring stations, we can work with that.
|
|
||||||
|
|
||||||
### Analyzing sensor data
|
|
||||||
Having analyzed the available data sources, let's finally get some measurements.
|
|
||||||
We could call `osem_measurements(pm25_sensors)` now, however we are focussing on
|
|
||||||
a restricted area of interest, the city of Berlin.
|
|
||||||
Luckily we can get the measurements filtered by a bounding box:
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
library(sf)
|
|
||||||
library(units)
|
|
||||||
library(lubridate)
|
|
||||||
library(dplyr)
|
|
||||||
|
|
||||||
# construct a bounding box: 12 kilometers around Berlin
|
|
||||||
berlin = st_point(c(13.4034, 52.5120)) %>%
|
|
||||||
st_sfc(crs = 4326) %>%
|
|
||||||
st_transform(3857) %>% # allow setting a buffer in meters
|
|
||||||
st_buffer(set_units(12, km)) %>%
|
|
||||||
st_transform(4326) %>% # the opensensemap expects WGS 84
|
|
||||||
st_bbox()
|
|
||||||
```
|
|
||||||
```{r results = F}
|
|
||||||
pm25 = osem_measurements(
|
|
||||||
berlin,
|
|
||||||
phenomenon = 'PM2.5',
|
|
||||||
from = now() - days(20), # defaults to 2 days
|
|
||||||
to = now()
|
|
||||||
)
|
|
||||||
|
|
||||||
plot(pm25)
|
|
||||||
```
|
|
||||||
|
|
||||||
Now we can get started with actual spatiotemporal data analysis.
|
|
||||||
First, let's mask the seemingly uncalibrated sensors:
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
outliers = filter(pm25, value > 100)$sensorId
|
|
||||||
# keep only the levels still present after filtering, i.e. the sensors that
# reported at least one value > 100 (assumes sensorId is a factor)
bad_sensors = outliers[, drop = TRUE] %>% levels()
|
|
||||||
|
|
||||||
pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
|
|
||||||
```
|
|
||||||
|
|
||||||
Then plot the measuring locations, flagging the outliers:
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# plot the measurement locations, coloured by the `invalid` flag
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = TRUE)
|
|
||||||
```
|
|
||||||
|
|
||||||
Removing these sensors yields a nicer time series plot:
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
pm25 %>% filter(invalid == FALSE) %>% plot()
|
|
||||||
```
|
|
||||||
|
|
||||||
Further analysis: comparison with LANUV data `TODO`
|
|
File diff suppressed because one or more lines are too long
@ -1,51 +0,0 @@
|
|||||||
## ----setup, results='hide'-----------------------------------------------
|
|
||||||
# this vignette requires:
|
|
||||||
library(opensensmapr)
|
|
||||||
library(jsonlite)
|
|
||||||
library(readr)
|
|
||||||
|
|
||||||
## ----cache---------------------------------------------------------------
|
|
||||||
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
|
|
||||||
|
|
||||||
# the next identical request will hit the cache only!
|
|
||||||
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
|
|
||||||
|
|
||||||
# requests without the cache parameter will still be performed normally
|
|
||||||
b = osem_boxes(grouptag = 'ifgi')
|
|
||||||
|
|
||||||
## ----cachelisting--------------------------------------------------------
|
|
||||||
list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
|
|
||||||
|
|
||||||
## ----cache_custom--------------------------------------------------------
|
|
||||||
cacheDir = getwd() # current working directory
|
|
||||||
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
|
|
||||||
|
|
||||||
# the next identical request will hit the cache only!
|
|
||||||
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
|
|
||||||
|
|
||||||
## ----clearcache----------------------------------------------------------
|
|
||||||
osem_clear_cache() # clears default cache
|
|
||||||
osem_clear_cache(getwd()) # clears a custom cache
|
|
||||||
|
|
||||||
## ----data, results='hide'------------------------------------------------
|
|
||||||
# first get our example data:
|
|
||||||
measurements = osem_measurements('Windrichtung')
|
|
||||||
|
|
||||||
## ----serialize_json------------------------------------------------------
|
|
||||||
# serializing measurements to JSON, and loading from file again:
|
|
||||||
write(jsonlite::serializeJSON(measurements), 'measurements.json')
|
|
||||||
measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
|
|
||||||
class(measurements_from_file)
|
|
||||||
|
|
||||||
## ----serialize_attrs-----------------------------------------------------
|
|
||||||
# note the toJSON call instead of serializeJSON
|
|
||||||
write(jsonlite::toJSON(measurements), 'measurements_bad.json')
|
|
||||||
measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
|
|
||||||
class(measurements_without_attrs)
|
|
||||||
|
|
||||||
measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
|
|
||||||
class(measurements_with_attrs)
|
|
||||||
|
|
||||||
## ----cleanup, include=FALSE----------------------------------------------
|
|
||||||
file.remove('measurements.json', 'measurements_bad.json')
|
|
||||||
|
|
@ -1,106 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Caching openSenseMap Data for Reproducibility"
|
|
||||||
author: "Norwin Roosen"
|
|
||||||
date: "`r Sys.Date()`"
|
|
||||||
output: rmarkdown::html_vignette
|
|
||||||
vignette: >
|
|
||||||
%\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility}
|
|
||||||
%\VignetteEngine{knitr::rmarkdown}
|
|
||||||
%\VignetteEncoding{UTF-8}
|
|
||||||
---
|
|
||||||
|
|
||||||
It may be useful to download data from openSenseMap only once.
|
|
||||||
For reproducible results, the data should be saved to disk, and reloaded at a
|
|
||||||
later point.
|
|
||||||
|
|
||||||
This avoids:
|
|
||||||
|
|
||||||
- changed results for queries without date parameters,
|
|
||||||
- unnecessary wait times,
|
|
||||||
- risk of API changes / API unavailability,
|
|
||||||
- stress on the openSenseMap-server.
|
|
||||||
|
|
||||||
This vignette shows how to use this built-in `opensensmapr` feature, and
|
|
||||||
how to do it yourself in case you want to save to other data formats.
|
|
||||||
|
|
||||||
```{r setup, results='hide'}
|
|
||||||
# this vignette requires:
|
|
||||||
library(opensensmapr)
|
|
||||||
library(jsonlite)
|
|
||||||
library(readr)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Using the opensensmapr Caching Feature
|
|
||||||
All data retrieval functions of `opensensmapr` have a built-in caching feature,
|
|
||||||
which serializes an API response to disk.
|
|
||||||
Subsequent identical requests will then return the serialized data instead of making
|
|
||||||
another request.
|
|
||||||
|
|
||||||
To use this feature, just add a path to a directory to the `cache` parameter:
|
|
||||||
```{r cache}
|
|
||||||
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
|
|
||||||
|
|
||||||
# the next identical request will hit the cache only!
|
|
||||||
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
|
|
||||||
|
|
||||||
# requests without the cache parameter will still be performed normally
|
|
||||||
b = osem_boxes(grouptag = 'ifgi')
|
|
||||||
```
|
|
||||||
|
|
||||||
Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL:
|
|
||||||
```{r cachelisting}
|
|
||||||
list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
|
|
||||||
```
|
|
||||||
|
|
||||||
You can maintain multiple caches simultaneously, which allows you to store only the data related to a script in the same directory:
|
|
||||||
```{r cache_custom}
|
|
||||||
cacheDir = getwd() # current working directory
|
|
||||||
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
|
|
||||||
|
|
||||||
# the next identical request will hit the cache only!
|
|
||||||
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
|
|
||||||
```
|
|
||||||
|
|
||||||
To get fresh results again, just call `osem_clear_cache()` for the respective cache:
|
|
||||||
```{r clearcache, results='hide'}
|
|
||||||
osem_clear_cache() # clears default cache
|
|
||||||
osem_clear_cache(getwd()) # clears a custom cache
|
|
||||||
```
|
|
||||||
|
|
||||||
## Custom (De-) Serialization
|
|
||||||
If you want to roll your own serialization method to support custom data formats,
|
|
||||||
here's how:
|
|
||||||
|
|
||||||
```{r data, results='hide'}
|
|
||||||
# first get our example data:
|
|
||||||
measurements = osem_measurements('Windrichtung')
|
|
||||||
```
|
|
||||||
|
|
||||||
If you are paranoid and worry about `.rds` files not being decodable anymore
|
|
||||||
in the (distant) future, you could serialize to a plain text format such as JSON.
|
|
||||||
This of course comes at the cost of storage space and performance.
|
|
||||||
```{r serialize_json}
|
|
||||||
# serializing measurements to JSON, and loading from file again:
|
|
||||||
write(jsonlite::serializeJSON(measurements), 'measurements.json')
|
|
||||||
measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
|
|
||||||
class(measurements_from_file)
|
|
||||||
```
|
|
||||||
|
|
||||||
This method also persists the R object metadata (classes, attributes).
|
|
||||||
If you were to use a serialization method that can't persist object metadata, you
|
|
||||||
could re-apply it with the following functions:
|
|
||||||
|
|
||||||
```{r serialize_attrs}
|
|
||||||
# note the toJSON call instead of serializeJSON
|
|
||||||
write(jsonlite::toJSON(measurements), 'measurements_bad.json')
|
|
||||||
measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
|
|
||||||
class(measurements_without_attrs)
|
|
||||||
|
|
||||||
measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
|
|
||||||
class(measurements_with_attrs)
|
|
||||||
```
|
|
||||||
The same goes for boxes via `osem_as_sensebox()`.
|
|
||||||
|
|
||||||
```{r cleanup, include=FALSE}
|
|
||||||
file.remove('measurements.json', 'measurements_bad.json')
|
|
||||||
```
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue