mirror of
https://github.com/sensebox/opensensmapr
synced 2025-02-18 17:23:57 +01:00
add inst/doc
This commit is contained in:
parent
62667ef139
commit
37d4dde1d6
12 changed files with 6822 additions and 0 deletions
133
inst/doc/osem-history.R
Normal file
133
inst/doc/osem-history.R
Normal file
|
@ -0,0 +1,133 @@
|
|||
## ----setup, results='hide', message=FALSE, warning=FALSE----------------------
|
||||
# required packages:
|
||||
library(opensensmapr) # data download
|
||||
library(dplyr) # data wrangling
|
||||
library(ggplot2) # plotting
|
||||
library(lubridate) # date arithmetic
|
||||
library(zoo) # rollmean()
|
||||
|
||||
## ----download-----------------------------------------------------------------
|
||||
# if you want to see results for a specific subset of boxes,
|
||||
# just specify a filter such as grouptag='ifgi' here
|
||||
boxes = osem_boxes()
|
||||
|
||||
## ----exposure_counts, message=FALSE-------------------------------------------
|
||||
exposure_counts = boxes %>%
|
||||
group_by(exposure) %>%
|
||||
mutate(count = row_number(createdAt))
|
||||
|
||||
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
|
||||
ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
|
||||
geom_line() +
|
||||
scale_colour_manual(values = exposure_colors) +
|
||||
xlab('Registration Date') + ylab('senseBox count')
|
||||
|
||||
## ----exposure_summary---------------------------------------------------------
|
||||
exposure_counts %>%
|
||||
summarise(
|
||||
oldest = min(createdAt),
|
||||
newest = max(createdAt),
|
||||
count = max(count)
|
||||
) %>%
|
||||
arrange(desc(count))
|
||||
|
||||
## ----grouptag_counts, message=FALSE-------------------------------------------
|
||||
grouptag_counts = boxes %>%
|
||||
group_by(grouptag) %>%
|
||||
# only include grouptags with 8 or more members
|
||||
filter(length(grouptag) >= 8 & !is.na(grouptag)) %>%
|
||||
mutate(count = row_number(createdAt))
|
||||
|
||||
# helper for sorting the grouptags by boxcount
|
||||
sortLvls = function(oldFactor, ascending = TRUE) {
  # tally how often each distinct value occurs
  frequencies = table(oldFactor)
  # order the tallies; with ascending = TRUE the rarest value comes first
  ranked = sort(frequencies, decreasing = !ascending)
  # rebuild the factor so that its levels follow the frequency ranking
  factor(oldFactor, levels = names(ranked))
}
|
||||
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
|
||||
|
||||
ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
|
||||
geom_line(aes(group = grouptag)) +
|
||||
xlab('Registration Date') + ylab('senseBox count')
|
||||
|
||||
## ----grouptag_summary---------------------------------------------------------
|
||||
grouptag_counts %>%
|
||||
summarise(
|
||||
oldest = min(createdAt),
|
||||
newest = max(createdAt),
|
||||
count = max(count)
|
||||
) %>%
|
||||
arrange(desc(count))
|
||||
|
||||
## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'------
|
||||
bins = 'week'
|
||||
mvavg_bins = 6
|
||||
|
||||
growth = boxes %>%
|
||||
mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
|
||||
group_by(week) %>%
|
||||
summarize(count = length(week)) %>%
|
||||
mutate(event = 'registered')
|
||||
|
||||
## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'--------
|
||||
inactive = boxes %>%
|
||||
# remove boxes that were updated in the last two days,
|
||||
# b/c any box becomes inactive at some point by definition of updatedAt
|
||||
filter(updatedAt < now() - days(2)) %>%
|
||||
mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
|
||||
group_by(week) %>%
|
||||
summarize(count = length(week)) %>%
|
||||
mutate(event = 'inactive')
|
||||
|
||||
## ----growthrate, warning=FALSE, message=FALSE, results='hide'-----------------
|
||||
# stack weekly registration and inactivity counts; grouping by event makes the
# mutate() below work on each of the two series separately
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)

# Moving average per event. It is computed here and not inside aes(), because
# aes() expressions are evaluated on the whole data frame: the averaging window
# would then run across the boundary between the two series.
# fill pads both ends with NA so that there is one value per row.
boxes_by_date_avg = boxes_by_date %>%
  mutate(count_avg = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))

ggplot(boxes_by_date_avg, aes(x = as.Date(week), colour = event)) +
  # paste() already inserts a space, so the label text has no trailing blank
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks="years", date_labels="%Y") +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  geom_line(aes(y = count_avg))
|
||||
|
||||
## ----exposure_duration, message=FALSE-----------------------------------------
|
||||
duration = boxes %>%
|
||||
group_by(exposure) %>%
|
||||
filter(!is.na(updatedAt)) %>%
|
||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
||||
|
||||
ggplot(duration, aes(x = exposure, y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days')
|
||||
|
||||
## ----grouptag_duration, message=FALSE-----------------------------------------
|
||||
duration = boxes %>%
|
||||
group_by(grouptag) %>%
|
||||
# only include grouptags with 8 or more members
|
||||
filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
|
||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
||||
|
||||
ggplot(duration, aes(x = grouptag, y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days')
|
||||
|
||||
duration %>%
|
||||
summarize(
|
||||
duration_avg = round(mean(duration)),
|
||||
duration_min = round(min(duration)),
|
||||
duration_max = round(max(duration)),
|
||||
oldest_box = round(max(difftime(now(), createdAt, units='days')))
|
||||
) %>%
|
||||
arrange(desc(duration_avg))
|
||||
|
||||
## ----year_duration, message=FALSE---------------------------------------------
|
||||
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
|
||||
duration = boxes %>%
|
||||
mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
|
||||
group_by(year) %>%
|
||||
filter(!is.na(updatedAt)) %>%
|
||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
||||
|
||||
ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
|
||||
|
243
inst/doc/osem-history.Rmd
Normal file
243
inst/doc/osem-history.Rmd
Normal file
|
@ -0,0 +1,243 @@
|
|||
---
|
||||
title: "Visualising the History of openSenseMap.org"
|
||||
author: "Norwin Roosen"
|
||||
date: '`r Sys.Date()`'
|
||||
output:
|
||||
rmarkdown::html_vignette:
|
||||
df_print: kable
|
||||
fig_height: 5
|
||||
fig_width: 7
|
||||
toc: yes
|
||||
html_document:
|
||||
code_folding: hide
|
||||
df_print: kable
|
||||
theme: lumen
|
||||
toc: yes
|
||||
toc_float: yes
|
||||
vignette: >
|
||||
%\VignetteIndexEntry{Visualising the History of openSenseMap.org}
|
||||
%\VignetteEngine{knitr::rmarkdown}
|
||||
%\VignetteEncoding{UTF-8}
|
||||
---
|
||||
|
||||
> This vignette serves as an example on data wrangling & visualization with
|
||||
`opensensmapr`, `dplyr` and `ggplot2`.
|
||||
|
||||
```{r setup, results='hide', message=FALSE, warning=FALSE}
|
||||
# required packages:
|
||||
library(opensensmapr) # data download
|
||||
library(dplyr) # data wrangling
|
||||
library(ggplot2) # plotting
|
||||
library(lubridate) # date arithmetic
|
||||
library(zoo) # rollmean()
|
||||
```
|
||||
|
||||
openSenseMap.org has grown quite a bit in the last years; it would be interesting
|
||||
to see how we got to the current `r osem_counts()$boxes` sensor stations,
|
||||
split up by various attributes of the boxes.
|
||||
|
||||
While `opensensmapr` provides extensive methods of filtering boxes by attributes
|
||||
on the server, we do the filtering within R to save time and gain flexibility.
|
||||
So the first step is to retrieve *all the boxes*:
|
||||
|
||||
```{r download}
|
||||
# if you want to see results for a specific subset of boxes,
|
||||
# just specify a filter such as grouptag='ifgi' here
|
||||
boxes = osem_boxes()
|
||||
```
|
||||
|
||||
# Plot count of boxes by time {.tabset}
|
||||
By looking at the `createdAt` attribute of each box we know the exact time a box
|
||||
was registered.
|
||||
With this approach we have no information about boxes that were deleted in the
|
||||
meantime, but that's okay for now.
|
||||
|
||||
## ...and exposure
|
||||
```{r exposure_counts, message=FALSE}
|
||||
exposure_counts = boxes %>%
|
||||
group_by(exposure) %>%
|
||||
mutate(count = row_number(createdAt))
|
||||
|
||||
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
|
||||
ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
|
||||
geom_line() +
|
||||
scale_colour_manual(values = exposure_colors) +
|
||||
xlab('Registration Date') + ylab('senseBox count')
|
||||
```
|
||||
|
||||
Outdoor boxes are growing *fast*!
|
||||
We can also see the introduction of `mobile` sensor "stations" in 2017. While
|
||||
mobile boxes are still few, we can expect a quick rise in 2018 once the new
|
||||
senseBox MCU with GPS support is released.
|
||||
|
||||
Let's have a quick summary:
|
||||
```{r exposure_summary}
|
||||
exposure_counts %>%
|
||||
summarise(
|
||||
oldest = min(createdAt),
|
||||
newest = max(createdAt),
|
||||
count = max(count)
|
||||
) %>%
|
||||
arrange(desc(count))
|
||||
```
|
||||
|
||||
## ...and grouptag
|
||||
We can try to find out where the increases in growth came from, by analysing the
|
||||
box count by grouptag.
|
||||
|
||||
Caveats: Only a small subset of boxes has a grouptag, and we should assume
|
||||
that these groups are actually bigger. Also, we can see that grouptag naming is
|
||||
inconsistent (`Luftdaten`, `luftdaten.info`, ...)
|
||||
|
||||
```{r grouptag_counts, message=FALSE}
|
||||
grouptag_counts = boxes %>%
|
||||
group_by(grouptag) %>%
|
||||
# only include grouptags with 8 or more members
|
||||
filter(length(grouptag) >= 8 & !is.na(grouptag)) %>%
|
||||
mutate(count = row_number(createdAt))
|
||||
|
||||
# helper for sorting the grouptags by boxcount
|
||||
sortLvls = function(oldFactor, ascending = TRUE) {
  # tally how often each distinct value occurs
  frequencies = table(oldFactor)
  # order the tallies; with ascending = TRUE the rarest value comes first
  ranked = sort(frequencies, decreasing = !ascending)
  # rebuild the factor so that its levels follow the frequency ranking
  factor(oldFactor, levels = names(ranked))
}
|
||||
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
|
||||
|
||||
ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
|
||||
geom_line(aes(group = grouptag)) +
|
||||
xlab('Registration Date') + ylab('senseBox count')
|
||||
```
|
||||
|
||||
```{r grouptag_summary}
|
||||
grouptag_counts %>%
|
||||
summarise(
|
||||
oldest = min(createdAt),
|
||||
newest = max(createdAt),
|
||||
count = max(count)
|
||||
) %>%
|
||||
arrange(desc(count))
|
||||
```
|
||||
|
||||
# Plot rate of growth and inactivity per week
|
||||
First we group the boxes by `createdAt` into bins of one week:
|
||||
```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
|
||||
bins = 'week'
|
||||
mvavg_bins = 6
|
||||
|
||||
growth = boxes %>%
|
||||
mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
|
||||
group_by(week) %>%
|
||||
summarize(count = length(week)) %>%
|
||||
mutate(event = 'registered')
|
||||
```
|
||||
|
||||
We can do the same for `updatedAt`, which informs us about the last change to
|
||||
a box, including uploaded measurements.
|
||||
This method of determining inactive boxes is fairly inaccurate and should be
|
||||
considered an approximation, because we have no information about intermediate
|
||||
inactive phases.
|
||||
Also deleted boxes would probably have a big impact here.
|
||||
```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
|
||||
inactive = boxes %>%
|
||||
# remove boxes that were updated in the last two days,
|
||||
# b/c any box becomes inactive at some point by definition of updatedAt
|
||||
filter(updatedAt < now() - days(2)) %>%
|
||||
mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
|
||||
group_by(week) %>%
|
||||
summarize(count = length(week)) %>%
|
||||
mutate(event = 'inactive')
|
||||
```
|
||||
|
||||
Now we can combine both datasets for plotting:
|
||||
```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
|
||||
# stack weekly registration and inactivity counts; grouping by event makes the
# mutate() below work on each of the two series separately
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)

# Moving average per event. It is computed here and not inside aes(), because
# aes() expressions are evaluated on the whole data frame: the averaging window
# would then run across the boundary between the two series.
# fill pads both ends with NA so that there is one value per row.
boxes_by_date_avg = boxes_by_date %>%
  mutate(count_avg = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))

ggplot(boxes_by_date_avg, aes(x = as.Date(week), colour = event)) +
  # paste() already inserts a space, so the label text has no trailing blank
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks="years", date_labels="%Y") +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  geom_line(aes(y = count_avg))
|
||||
```
|
||||
|
||||
We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`.
|
||||
This was enabled by an integration of openSenseMap.org into the firmware of the
|
||||
air quality monitoring project [luftdaten.info](https://sensor.community/de/).
|
||||
The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues
|
||||
of the senseBox hardware, but I have no data on the exact time frames to verify.
|
||||
|
||||
# Plot duration of boxes being active {.tabset}
|
||||
While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity
|
||||
of each box, and look at metrics by exposure and grouptag once more:
|
||||
|
||||
## ...by exposure
|
||||
```{r exposure_duration, message=FALSE}
|
||||
duration = boxes %>%
|
||||
group_by(exposure) %>%
|
||||
filter(!is.na(updatedAt)) %>%
|
||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
||||
|
||||
ggplot(duration, aes(x = exposure, y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days')
|
||||
```
|
||||
|
||||
The time of activity averages at only `r round(mean(duration$duration))` days,
|
||||
though there are boxes with `r round(max(duration$duration))` days of activity,
|
||||
spanning a large chunk of openSenseMap's existence.
|
||||
|
||||
## ...by grouptag
|
||||
```{r grouptag_duration, message=FALSE}
|
||||
duration = boxes %>%
|
||||
group_by(grouptag) %>%
|
||||
# only include grouptags with 8 or more members
|
||||
filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
|
||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
||||
|
||||
ggplot(duration, aes(x = grouptag, y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days')
|
||||
|
||||
duration %>%
|
||||
summarize(
|
||||
duration_avg = round(mean(duration)),
|
||||
duration_min = round(min(duration)),
|
||||
duration_max = round(max(duration)),
|
||||
oldest_box = round(max(difftime(now(), createdAt, units='days')))
|
||||
) %>%
|
||||
arrange(desc(duration_avg))
|
||||
```
|
||||
|
||||
The time of activity averages at only `r round(mean(duration$duration))` days,
|
||||
though there are boxes with `r round(max(duration$duration))` days of activity,
|
||||
spanning a large chunk of openSenseMap's existence.
|
||||
|
||||
## ...by year of registration
|
||||
This is less useful, as older boxes are active for a longer time by definition.
|
||||
If you have an idea how to compensate for that, please send a [Pull Request][PR]!
|
||||
|
||||
```{r year_duration, message=FALSE}
|
||||
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
|
||||
duration = boxes %>%
|
||||
mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
|
||||
group_by(year) %>%
|
||||
filter(!is.na(updatedAt)) %>%
|
||||
mutate(duration = difftime(updatedAt, createdAt, units='days'))
|
||||
|
||||
ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
|
||||
```
|
||||
|
||||
# More Visualisations
|
||||
Other visualisations come to mind, and are left as an exercise to the reader.
|
||||
If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
|
||||
|
||||
* growth by phenomenon
|
||||
* growth by location -> (interactive) map
|
||||
* set inactive rate in relation to total box count
|
||||
* filter timespans with big dips in growth rate, and extrapolate the amount of
|
||||
senseBoxes that could be on the platform today, assuming there were no production issues ;)
|
||||
|
||||
[PR]: https://github.com/sensebox/opensensmapr/pulls
|
1818
inst/doc/osem-history.html
Normal file
1818
inst/doc/osem-history.html
Normal file
File diff suppressed because one or more lines are too long
162
inst/doc/osem-history_revised.R
Normal file
162
inst/doc/osem-history_revised.R
Normal file
|
@ -0,0 +1,162 @@
|
|||
## ----setup, results='hide', message=FALSE, warning=FALSE----------------------
|
||||
# required packages:
|
||||
library(opensensmapr) # data download
|
||||
library(dplyr) # data wrangling
|
||||
library(ggplot2) # plotting
|
||||
library(lubridate) # date arithmetic
|
||||
library(zoo) # rollmean()
|
||||
|
||||
## ----download, results='hide', message=FALSE, warning=FALSE-------------------
|
||||
# if you want to see results for a specific subset of boxes,
|
||||
# just specify a filter such as grouptag='ifgi' here
|
||||
boxes_all = osem_boxes()
|
||||
boxes = boxes_all
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
# Keep only boxes whose location timestamp falls within 2022.
# The upper bound is exclusive: a bare "2022-12-31" is coerced to midnight at
# the start of that day, which drops every box from the last day of the year.
# assumes locationtimestamp is a date-time (POSIXct) column -- TODO confirm
boxes = filter(
  boxes,
  locationtimestamp >= as.POSIXct("2022-01-01") &
    locationtimestamp < as.POSIXct("2023-01-01")
)
# keep the summary in a variable so that it is not printed
# NOTE(review): this name equals the base S3 method summary.data.frame;
# consider renaming once it is confirmed that nothing else reads it
summary.data.frame = summary(boxes)
|
||||
|
||||
## ----message=F, warning=F-----------------------------------------------------
|
||||
if (!require('maps')) install.packages('maps')
|
||||
if (!require('maptools')) install.packages('maptools')
|
||||
if (!require('rgeos')) install.packages('rgeos')
|
||||
|
||||
plot(boxes)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
phenoms = osem_phenomena(boxes)
|
||||
str(phenoms)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
phenoms[phenoms > 50]
|
||||
|
||||
## ----exposure_counts, message=FALSE-------------------------------------------
|
||||
exposure_counts = boxes %>%
|
||||
group_by(exposure) %>%
|
||||
mutate(count = row_number(locationtimestamp))
|
||||
|
||||
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
|
||||
ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
|
||||
geom_line() +
|
||||
scale_colour_manual(values = exposure_colors) +
|
||||
xlab('Registration Date') + ylab('senseBox count')
|
||||
|
||||
## ----exposure_summary---------------------------------------------------------
|
||||
exposure_counts %>%
|
||||
summarise(
|
||||
oldest = min(locationtimestamp),
|
||||
newest = max(locationtimestamp),
|
||||
count = max(count)
|
||||
) %>%
|
||||
arrange(desc(count))
|
||||
|
||||
## ----grouptag_counts, message=FALSE-------------------------------------------
|
||||
grouptag_counts = boxes %>%
|
||||
group_by(grouptag) %>%
|
||||
# only include grouptags with 15 or more members
|
||||
filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>%
|
||||
mutate(count = row_number(locationtimestamp))
|
||||
|
||||
# helper for sorting the grouptags by boxcount
|
||||
sortLvls = function(oldFactor, ascending = TRUE) {
  # tally how often each distinct value occurs
  frequencies = table(oldFactor)
  # order the tallies; with ascending = TRUE the rarest value comes first
  ranked = sort(frequencies, decreasing = !ascending)
  # rebuild the factor so that its levels follow the frequency ranking
  factor(oldFactor, levels = names(ranked))
}
|
||||
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
|
||||
|
||||
ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
|
||||
geom_line(aes(group = grouptag)) +
|
||||
xlab('Registration Date') + ylab('senseBox count')
|
||||
|
||||
## ----grouptag_summary---------------------------------------------------------
|
||||
grouptag_counts %>%
|
||||
summarise(
|
||||
oldest = min(locationtimestamp),
|
||||
newest = max(locationtimestamp),
|
||||
count = max(count)
|
||||
) %>%
|
||||
arrange(desc(count))
|
||||
|
||||
## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'------
|
||||
bins = 'week'
|
||||
mvavg_bins = 6
|
||||
|
||||
growth = boxes %>%
|
||||
mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
|
||||
group_by(week) %>%
|
||||
summarize(count = length(week)) %>%
|
||||
mutate(event = 'registered')
|
||||
|
||||
## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'--------
|
||||
inactive = boxes %>%
|
||||
# remove boxes that sent a measurement in the last two days,
# b/c any box becomes inactive at some point by definition of lastMeasurement
|
||||
filter(lastMeasurement < now() - days(2)) %>%
|
||||
mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
|
||||
filter(as.Date(week) > as.Date("2021-12-31")) %>%
|
||||
group_by(week) %>%
|
||||
summarize(count = length(week)) %>%
|
||||
mutate(event = 'inactive')
|
||||
|
||||
## ----growthrate, warning=FALSE, message=FALSE, results='hide'-----------------
|
||||
# stack weekly registration and inactivity counts; grouping by event makes the
# mutate() below work on each of the two series separately
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)

# Moving average per event. It is computed here and not inside aes(), because
# aes() expressions are evaluated on the whole data frame: the averaging window
# would then run across the boundary between the two series.
# fill pads both ends with NA so that there is one value per row.
# A separate table is used so that boxes_by_date keeps its original columns.
boxes_by_date_avg = boxes_by_date %>%
  mutate(count_avg = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))

ggplot(boxes_by_date_avg, aes(x = as.Date(week), colour = event)) +
  # paste() already inserts a space, so the label text has no trailing blank
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks="years", date_labels="%Y") +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  geom_line(aes(y = count_avg))
|
||||
|
||||
## ----table_mostregistrations--------------------------------------------------
|
||||
boxes_by_date %>%
|
||||
filter(count > 50) %>%
|
||||
arrange(desc(count))
|
||||
|
||||
## ----exposure_duration, message=FALSE-----------------------------------------
|
||||
durations = boxes %>%
|
||||
group_by(exposure) %>%
|
||||
filter(!is.na(lastMeasurement)) %>%
|
||||
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
|
||||
filter(duration >= 0)
|
||||
|
||||
ggplot(durations, aes(x = exposure, y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days')
|
||||
|
||||
## ----grouptag_duration, message=FALSE-----------------------------------------
|
||||
durations = boxes %>%
|
||||
filter(!is.na(lastMeasurement)) %>%
|
||||
group_by(grouptag) %>%
|
||||
# only include grouptags with 15 or more members
|
||||
filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>%
|
||||
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
|
||||
filter(duration >= 0)
|
||||
|
||||
ggplot(durations, aes(x = grouptag, y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days')
|
||||
|
||||
durations %>%
|
||||
summarize(
|
||||
duration_avg = round(mean(duration)),
|
||||
duration_min = round(min(duration)),
|
||||
duration_max = round(max(duration)),
|
||||
oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
|
||||
) %>%
|
||||
arrange(desc(duration_avg))
|
||||
|
||||
## ----year_duration, message=FALSE---------------------------------------------
|
||||
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
|
||||
duration = boxes %>%
|
||||
mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
|
||||
group_by(year) %>%
|
||||
filter(!is.na(lastMeasurement)) %>%
|
||||
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
|
||||
filter(duration >= 0)
|
||||
|
||||
ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
|
||||
|
300
inst/doc/osem-history_revised.Rmd
Normal file
300
inst/doc/osem-history_revised.Rmd
Normal file
|
@ -0,0 +1,300 @@
|
|||
---
|
||||
title: "Visualising the Development of openSenseMap.org in 2022"
|
||||
author: "Jan Stenkamp"
|
||||
date: '`r Sys.Date()`'
|
||||
output:
|
||||
html_document:
|
||||
code_folding: hide
|
||||
df_print: kable
|
||||
theme: lumen
|
||||
toc: yes
|
||||
toc_float: yes
|
||||
rmarkdown::html_vignette:
|
||||
df_print: kable
|
||||
fig_height: 5
|
||||
fig_width: 7
|
||||
toc: yes
|
||||
vignette: >
|
||||
%\VignetteIndexEntry{Visualising the Development of openSenseMap.org in 2022}
|
||||
%\VignetteEncoding{UTF-8}
|
||||
%\VignetteEngine{knitr::rmarkdown}
|
||||
---
|
||||
|
||||
> This vignette serves as an example on data wrangling & visualization with
|
||||
`opensensmapr`, `dplyr` and `ggplot2`.
|
||||
|
||||
```{r setup, results='hide', message=FALSE, warning=FALSE}
|
||||
# required packages:
|
||||
library(opensensmapr) # data download
|
||||
library(dplyr) # data wrangling
|
||||
library(ggplot2) # plotting
|
||||
library(lubridate) # date arithmetic
|
||||
library(zoo) # rollmean()
|
||||
```
|
||||
|
||||
openSenseMap.org has grown quite a bit in the last years; it would be interesting
|
||||
to see how we got to the current `r osem_counts()$boxes` sensor stations,
|
||||
split up by various attributes of the boxes.
|
||||
|
||||
While `opensensmapr` provides extensive methods of filtering boxes by attributes
|
||||
on the server, we do the filtering within R to save time and gain flexibility.
|
||||
|
||||
|
||||
So the first step is to retrieve *all the boxes*.
|
||||
|
||||
```{r download, results='hide', message=FALSE, warning=FALSE}
|
||||
# if you want to see results for a specific subset of boxes,
|
||||
# just specify a filter such as grouptag='ifgi' here
|
||||
boxes_all = osem_boxes()
|
||||
boxes = boxes_all
|
||||
```
|
||||
# Introduction
|
||||
In the following we just want to have a look at the boxes created in 2022, so we filter for them.
|
||||
|
||||
```{r}
|
||||
# Keep only boxes whose location timestamp falls within 2022.
# The upper bound is exclusive: a bare "2022-12-31" is coerced to midnight at
# the start of that day, which drops every box from the last day of the year.
# assumes locationtimestamp is a date-time (POSIXct) column -- TODO confirm
boxes = filter(
  boxes,
  locationtimestamp >= as.POSIXct("2022-01-01") &
    locationtimestamp < as.POSIXct("2023-01-01")
)
# keep the summary in a variable so that it is not printed
# NOTE(review): this name equals the base S3 method summary.data.frame;
# consider renaming once it is confirmed that nothing else reads it
summary.data.frame = summary(boxes)
|
||||
```
|
||||
|
||||
<!-- This gives a good overview already: As of writing this, there are more than 11,000 -->
|
||||
<!-- sensor stations, of which ~30% are currently running. Most of them are placed -->
|
||||
<!-- outdoors and have around 5 sensors each. -->
|
||||
<!-- The oldest station is from August 2016, while the latest station was registered a -->
|
||||
<!-- couple of minutes ago. -->
|
||||
|
||||
Another feature of interest is the spatial distribution of the boxes: `plot()`
|
||||
can help us out here. This function requires a bunch of optional dependencies though.
|
||||
|
||||
```{r message=F, warning=F}
|
||||
if (!require('maps')) install.packages('maps')
|
||||
if (!require('maptools')) install.packages('maptools')
|
||||
if (!require('rgeos')) install.packages('rgeos')
|
||||
|
||||
plot(boxes)
|
||||
```
|
||||
|
||||
But what do these sensor stations actually measure? Let's find out.
|
||||
`osem_phenomena()` gives us a named list of the counts of each observed
|
||||
phenomenon for the given set of sensor stations:
|
||||
|
||||
```{r}
|
||||
phenoms = osem_phenomena(boxes)
|
||||
str(phenoms)
|
||||
```
|
||||
|
||||
That's quite some noise there, with many phenomena being measured by a single
|
||||
sensor only, or many duplicated phenomena due to slightly different spellings.
|
||||
We should clean that up, but for now let's just filter out the noise and find
|
||||
those phenomena with high sensor numbers:
|
||||
|
||||
```{r}
|
||||
phenoms[phenoms > 50]
|
||||
```
|
||||
|
||||
|
||||
# Plot count of boxes by time {.tabset}
|
||||
By looking at the `createdAt` attribute of each box we know the exact time a box
|
||||
was registered. Because of database migration issues, most `createdAt` values are wrong (~80% of boxes appear as created on 2022-03-30), so we use the `timestamp` attribute of the `currentlocation` instead, which should correspond to the creation date in most cases.
|
||||
|
||||
With this approach we have no information about boxes that were deleted in the
|
||||
meantime, but that's okay for now.
|
||||
|
||||
## ...and exposure
|
||||
```{r exposure_counts, message=FALSE}
|
||||
exposure_counts = boxes %>%
|
||||
group_by(exposure) %>%
|
||||
mutate(count = row_number(locationtimestamp))
|
||||
|
||||
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
|
||||
ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
|
||||
geom_line() +
|
||||
scale_colour_manual(values = exposure_colors) +
|
||||
xlab('Registration Date') + ylab('senseBox count')
|
||||
```
|
||||
|
||||
Outdoor boxes are growing *fast*!
|
||||
We can also see the introduction of `mobile` sensor "stations" in 2017.
|
||||
|
||||
Let's have a quick summary:
|
||||
```{r exposure_summary}
|
||||
exposure_counts %>%
|
||||
summarise(
|
||||
oldest = min(locationtimestamp),
|
||||
newest = max(locationtimestamp),
|
||||
count = max(count)
|
||||
) %>%
|
||||
arrange(desc(count))
|
||||
```
|
||||
|
||||
## ...and grouptag
|
||||
We can try to find out where the increases in growth came from, by analysing the
|
||||
box count by grouptag.
|
||||
|
||||
Caveats: Only a small subset of boxes has a grouptag, and we should assume
|
||||
that these groups are actually bigger. Also, we can see that grouptag naming is
|
||||
inconsistent (`Luftdaten`, `luftdaten.info`, ...)
|
||||
|
||||
```{r grouptag_counts, message=FALSE}
|
||||
grouptag_counts = boxes %>%
|
||||
group_by(grouptag) %>%
|
||||
# only include grouptags with 15 or more members
|
||||
filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>%
|
||||
mutate(count = row_number(locationtimestamp))
|
||||
|
||||
# helper for sorting the grouptags by boxcount
|
||||
sortLvls = function(oldFactor, ascending = TRUE) {
  # tally how often each distinct value occurs
  frequencies = table(oldFactor)
  # order the tallies; with ascending = TRUE the rarest value comes first
  ranked = sort(frequencies, decreasing = !ascending)
  # rebuild the factor so that its levels follow the frequency ranking
  factor(oldFactor, levels = names(ranked))
}
|
||||
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
|
||||
|
||||
ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
|
||||
geom_line(aes(group = grouptag)) +
|
||||
xlab('Registration Date') + ylab('senseBox count')
|
||||
```
|
||||
|
||||
```{r grouptag_summary}
|
||||
grouptag_counts %>%
|
||||
summarise(
|
||||
oldest = min(locationtimestamp),
|
||||
newest = max(locationtimestamp),
|
||||
count = max(count)
|
||||
) %>%
|
||||
arrange(desc(count))
|
||||
```
|
||||
|
||||
# Plot rate of growth and inactivity per week
|
||||
First we group the boxes by `locationtimestamp` into bins of one week:
|
||||
```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
|
||||
bins = 'week'
|
||||
mvavg_bins = 6
|
||||
|
||||
growth = boxes %>%
|
||||
mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
|
||||
group_by(week) %>%
|
||||
summarize(count = length(week)) %>%
|
||||
mutate(event = 'registered')
|
||||
```
|
||||
|
||||
We can do the same for `updatedAt`, which informs us about the last change to
|
||||
a box, including uploaded measurements. As a lot of boxes were "updated" by the database
|
||||
migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement`
|
||||
attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes
|
||||
boxes which were created but never made a measurement.
|
||||
|
||||
This method of determining inactive boxes is fairly inaccurate and should be
|
||||
considered an approximation, because we have no information about intermediate
|
||||
inactive phases.
|
||||
Also deleted boxes would probably have a big impact here.
|
||||
```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
|
||||
inactive = boxes %>%
|
||||
# remove boxes that sent a measurement in the last two days,
# b/c any box becomes inactive at some point by definition of lastMeasurement
|
||||
filter(lastMeasurement < now() - days(2)) %>%
|
||||
mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
|
||||
filter(as.Date(week) > as.Date("2021-12-31")) %>%
|
||||
group_by(week) %>%
|
||||
summarize(count = length(week)) %>%
|
||||
mutate(event = 'inactive')
|
||||
```
|
||||
|
||||
Now we can combine both datasets for plotting:
|
||||
```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
|
||||
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)

# Smooth each event series on its own. Calling rollmean() inside aes() would
# evaluate it once over all rows, so the averages around the point where the
# 'registered' rows end and the 'inactive' rows begin would mix both series.
# Kept in a separate object so that `boxes_by_date` stays unchanged.
boxes_smoothed = boxes_by_date %>%
  mutate(mvavg = if (n() >= mvavg_bins) {
    # pad both ends with NA so the result is as long as the input
    rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))
  } else {
    NA_real_ # fewer bins than one averaging window in this series
  })

ggplot(boxes_smoothed, aes(x = as.Date(week), colour = event)) +
  # paste() already separates its arguments with a single space
  xlab('Time') + ylab(paste('rate per', bins)) +
  scale_x_date(date_breaks="years", date_labels="%Y") +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  # moving average per event, computed above
  geom_line(aes(y = mvavg))
|
||||
```
|
||||
|
||||
And see in which weeks the most boxes become (in)active:
|
||||
```{r table_mostregistrations}
|
||||
boxes_by_date %>%
|
||||
filter(count > 50) %>%
|
||||
arrange(desc(count))
|
||||
```
|
||||
|
||||
# Plot duration of boxes being active {.tabset}
|
||||
While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity
|
||||
of each box, and look at metrics by exposure and grouptag once more:
|
||||
|
||||
## ...by exposure
|
||||
```{r exposure_duration, message=FALSE}
|
||||
durations = boxes %>%
|
||||
group_by(exposure) %>%
|
||||
filter(!is.na(lastMeasurement)) %>%
|
||||
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
|
||||
filter(duration >= 0)
|
||||
|
||||
ggplot(durations, aes(x = exposure, y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days')
|
||||
```
|
||||
|
||||
The time of activity averages at only `r round(mean(durations$duration))` days,
|
||||
though there are boxes with `r round(max(durations$duration))` days of activity,
|
||||
spanning a large chunk of openSenseMap's existence.
|
||||
|
||||
## ...by grouptag
|
||||
```{r grouptag_duration, message=FALSE}
|
||||
# activity duration (days between `locationtimestamp` and `lastMeasurement`)
# of every box that has a measurement, grouped by grouptag
durations = boxes %>%
  filter(!is.na(lastMeasurement)) %>%
  group_by(grouptag) %>%
  # only include grouptags with 15 or more members
  # (boxes without lastMeasurement were already removed above)
  filter(n() >= 15 & !is.na(grouptag)) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
  # drop boxes whose last measurement lies before their locationtimestamp
  filter(duration >= 0)
|
||||
|
||||
ggplot(durations, aes(x = grouptag, y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days')
|
||||
|
||||
durations %>%
|
||||
summarize(
|
||||
duration_avg = round(mean(duration)),
|
||||
duration_min = round(min(duration)),
|
||||
duration_max = round(max(duration)),
|
||||
oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
|
||||
) %>%
|
||||
arrange(desc(duration_avg))
|
||||
```
|
||||
|
||||
The time of activity averages at only `r round(mean(durations$duration))` days,
|
||||
though there are boxes with `r round(max(durations$duration))` days of activity,
|
||||
spanning a large chunk of openSenseMap's existence.
|
||||
|
||||
## ...by year of registration
|
||||
This is less useful, as older boxes are active for a longer time by definition.
|
||||
If you have an idea how to compensate for that, please send a [Pull Request][PR]!
|
||||
|
||||
```{r year_duration, message=FALSE}
|
||||
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
|
||||
duration = boxes %>%
|
||||
mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
|
||||
group_by(year) %>%
|
||||
filter(!is.na(lastMeasurement)) %>%
|
||||
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
|
||||
filter(duration >= 0)
|
||||
|
||||
ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
|
||||
geom_boxplot() +
|
||||
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
|
||||
```
|
||||
|
||||
# More Visualisations
|
||||
Other visualisations come to mind, and are left as an exercise to the reader.
|
||||
If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
|
||||
|
||||
* growth by phenomenon
|
||||
* growth by location -> (interactive) map
|
||||
* set inactive rate in relation to total box count
|
||||
* filter timespans with big dips in growth rate, and extrapolate the amount of
|
||||
senseBoxes that could be on the platform today, assuming there were no production issues ;)
|
||||
|
||||
[PR]: https://github.com/sensebox/opensensmapr/pulls
|
||||
|
||||
|
2471
inst/doc/osem-history_revised.html
Normal file
2471
inst/doc/osem-history_revised.html
Normal file
File diff suppressed because one or more lines are too long
73
inst/doc/osem-intro.R
Normal file
73
inst/doc/osem-intro.R
Normal file
|
@ -0,0 +1,73 @@
|
|||
## ----setup, include=FALSE-----------------------------------------------------
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
|
||||
## ----results = F--------------------------------------------------------------
|
||||
library(magrittr)
|
||||
library(opensensmapr)
|
||||
|
||||
all_sensors = osem_boxes()
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
summary(all_sensors)
|
||||
|
||||
## ----message=F, warning=F-----------------------------------------------------
|
||||
if (!require('maps')) install.packages('maps')
|
||||
if (!require('maptools')) install.packages('maptools')
|
||||
if (!require('rgeos')) install.packages('rgeos')
|
||||
|
||||
plot(all_sensors)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
phenoms = osem_phenomena(all_sensors)
|
||||
str(phenoms)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
phenoms[phenoms > 20]
|
||||
|
||||
## ----results = F--------------------------------------------------------------
|
||||
pm25_sensors = osem_boxes(
|
||||
exposure = 'outdoor',
|
||||
date = Sys.time(), # ±4 hours
|
||||
phenomenon = 'PM2.5'
|
||||
)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
summary(pm25_sensors)
|
||||
plot(pm25_sensors)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
library(sf)
|
||||
library(units)
|
||||
library(lubridate)
|
||||
library(dplyr)
|
||||
|
||||
# construct a bounding box: 12 kilometers around Berlin
|
||||
berlin = st_point(c(13.4034, 52.5120)) %>%
|
||||
st_sfc(crs = 4326) %>%
|
||||
st_transform(3857) %>% # allow setting a buffer in meters
|
||||
st_buffer(set_units(12, km)) %>%
|
||||
st_transform(4326) %>% # the opensensemap expects WGS 84
|
||||
st_bbox()
|
||||
|
||||
## ----results = F--------------------------------------------------------------
|
||||
pm25 = osem_measurements(
|
||||
berlin,
|
||||
phenomenon = 'PM2.5',
|
||||
from = now() - days(3), # defaults to 2 days
|
||||
to = now()
|
||||
)
|
||||
|
||||
plot(pm25)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
outliers = filter(pm25, value > 100)$sensorId
|
||||
bad_sensors = outliers[, drop = T] %>% levels()
|
||||
|
||||
pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
|
||||
|
||||
## -----------------------------------------------------------------------------
|
||||
pm25 %>% filter(invalid == FALSE) %>% plot()
|
||||
|
151
inst/doc/osem-intro.Rmd
Normal file
151
inst/doc/osem-intro.Rmd
Normal file
|
@ -0,0 +1,151 @@
|
|||
---
|
||||
title: "Exploring the openSenseMap Dataset"
|
||||
author: "Norwin Roosen"
|
||||
date: "`r Sys.Date()`"
|
||||
output:
|
||||
rmarkdown::html_vignette:
|
||||
fig_margin: 0
|
||||
fig_width: 6
|
||||
fig_height: 4
|
||||
vignette: >
|
||||
%\VignetteIndexEntry{Exploring the openSenseMap Dataset}
|
||||
%\VignetteEngine{knitr::rmarkdown}
|
||||
%\VignetteEncoding{UTF-8}
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
```
|
||||
|
||||
This package provides data ingestion functions for almost any data stored on the
|
||||
open data platform for environmental sensor data <https://opensensemap.org>.
|
||||
Its main goals are to provide means for:
|
||||
|
||||
- big data analysis of the measurements stored on the platform
|
||||
- sensor metadata analysis (sensor counts, spatial distribution, temporal trends)
|
||||
|
||||
### Exploring the dataset
|
||||
Before we look at actual observations, let's get a grasp of the openSenseMap
|
||||
dataset's structure.
|
||||
|
||||
```{r results = F}
|
||||
library(magrittr)
|
||||
library(opensensmapr)
|
||||
|
||||
all_sensors = osem_boxes()
|
||||
```
|
||||
```{r}
|
||||
summary(all_sensors)
|
||||
```
|
||||
|
||||
This gives a good overview already: As of writing this, there are more than 700
|
||||
sensor stations, of which ~50% are currently running. Most of them are placed
|
||||
outdoors and have around 5 sensors each.
|
||||
The oldest station is from May 2014, while the latest station was registered a
|
||||
couple of minutes ago.
|
||||
|
||||
Another feature of interest is the spatial distribution of the boxes: `plot()`
|
||||
can help us out here. This function requires a bunch of optional dependencies though.
|
||||
|
||||
```{r message=F, warning=F}
|
||||
if (!require('maps')) install.packages('maps')
|
||||
if (!require('maptools')) install.packages('maptools')
|
||||
if (!require('rgeos')) install.packages('rgeos')
|
||||
|
||||
plot(all_sensors)
|
||||
```
|
||||
|
||||
It seems we have to reduce our area of interest to Germany.
|
||||
|
||||
But what do these sensor stations actually measure? Let's find out.
|
||||
`osem_phenomena()` gives us a named list of the counts of each observed
|
||||
phenomenon for the given set of sensor stations:
|
||||
|
||||
```{r}
|
||||
phenoms = osem_phenomena(all_sensors)
|
||||
str(phenoms)
|
||||
```
|
||||
|
||||
That's quite some noise there, with many phenomena being measured by a single
|
||||
sensor only, or many duplicated phenomena due to slightly different spellings.
|
||||
We should clean that up, but for now let's just filter out the noise and find
|
||||
those phenomena with high sensor numbers:
|
||||
|
||||
```{r}
|
||||
phenoms[phenoms > 20]
|
||||
```
|
||||
|
||||
Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more
|
||||
interesting to analyze though.
|
||||
We should check how many sensor stations provide useful data: We want only those
|
||||
boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting
|
||||
measurements:
|
||||
|
||||
```{r results = F}
|
||||
pm25_sensors = osem_boxes(
|
||||
exposure = 'outdoor',
|
||||
date = Sys.time(), # ±4 hours
|
||||
phenomenon = 'PM2.5'
|
||||
)
|
||||
```
|
||||
```{r}
|
||||
summary(pm25_sensors)
|
||||
plot(pm25_sensors)
|
||||
```
|
||||
|
||||
That's still more than 200 measuring stations; we can work with that.
|
||||
|
||||
### Analyzing sensor data
|
||||
Having analyzed the available data sources, let's finally get some measurements.
|
||||
We could call `osem_measurements(pm25_sensors)` now, however we are focusing on
|
||||
a restricted area of interest, the city of Berlin.
|
||||
Luckily we can get the measurements filtered by a bounding box:
|
||||
|
||||
```{r}
|
||||
library(sf)
|
||||
library(units)
|
||||
library(lubridate)
|
||||
library(dplyr)
|
||||
|
||||
# construct a bounding box: 12 kilometers around Berlin
|
||||
berlin = st_point(c(13.4034, 52.5120)) %>%
|
||||
st_sfc(crs = 4326) %>%
|
||||
st_transform(3857) %>% # allow setting a buffer in meters
|
||||
st_buffer(set_units(12, km)) %>%
|
||||
st_transform(4326) %>% # the opensensemap expects WGS 84
|
||||
st_bbox()
|
||||
```
|
||||
```{r results = F}
|
||||
pm25 = osem_measurements(
|
||||
berlin,
|
||||
phenomenon = 'PM2.5',
|
||||
from = now() - days(3), # defaults to 2 days
|
||||
to = now()
|
||||
)
|
||||
|
||||
plot(pm25)
|
||||
```
|
||||
|
||||
Now we can get started with actual spatiotemporal data analysis.
|
||||
First, let's mask the seemingly uncalibrated sensors:
|
||||
|
||||
```{r}
|
||||
outliers = filter(pm25, value > 100)$sensorId
|
||||
# drop = TRUE keeps only the factor levels that still occur among the outliers
bad_sensors = outliers[, drop = TRUE] %>% levels()
|
||||
|
||||
pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
|
||||
```
|
||||
|
||||
Then plot the measuring locations, flagging the outliers:
|
||||
|
||||
```{r}
|
||||
# colour each measuring location by whether its sensor was flagged as invalid
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = TRUE)
|
||||
```
|
||||
|
||||
Removing these sensors yields a nicer time series plot:
|
||||
|
||||
```{r}
|
||||
pm25 %>% filter(invalid == FALSE) %>% plot()
|
||||
```
|
||||
|
||||
Further analysis: comparison with LANUV data `TODO`
|
870
inst/doc/osem-intro.html
Normal file
870
inst/doc/osem-intro.html
Normal file
File diff suppressed because one or more lines are too long
51
inst/doc/osem-serialization.R
Normal file
51
inst/doc/osem-serialization.R
Normal file
|
@ -0,0 +1,51 @@
|
|||
## ----setup, results='hide'----------------------------------------------------
|
||||
# this vignette requires:
|
||||
library(opensensmapr)
|
||||
library(jsonlite)
|
||||
library(readr)
|
||||
|
||||
## ----cache--------------------------------------------------------------------
|
||||
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
|
||||
|
||||
# the next identical request will hit the cache only!
|
||||
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
|
||||
|
||||
# requests without the cache parameter will still be performed normally
|
||||
b = osem_boxes(grouptag = 'ifgi')
|
||||
|
||||
## ----cachelisting-------------------------------------------------------------
|
||||
list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
|
||||
|
||||
## ----cache_custom-------------------------------------------------------------
|
||||
cacheDir = getwd() # current working directory
|
||||
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
|
||||
|
||||
# the next identical request will hit the cache only!
|
||||
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
|
||||
|
||||
## ----clearcache, results='hide'-----------------------------------------------
|
||||
osem_clear_cache() # clears default cache
|
||||
osem_clear_cache(getwd()) # clears a custom cache
|
||||
|
||||
## ----data, results='hide'-----------------------------------------------------
|
||||
# first get our example data:
|
||||
measurements = osem_measurements('Windgeschwindigkeit')
|
||||
|
||||
## ----serialize_json-----------------------------------------------------------
|
||||
# serializing senseBoxes to JSON, and loading from file again:
|
||||
write(jsonlite::serializeJSON(measurements), 'measurements.json')
|
||||
measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
|
||||
class(measurements_from_file)
|
||||
|
||||
## ----serialize_attrs----------------------------------------------------------
|
||||
# note the toJSON call instead of serializeJSON
|
||||
write(jsonlite::toJSON(measurements), 'measurements_bad.json')
|
||||
measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
|
||||
class(measurements_without_attrs)
|
||||
|
||||
measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
|
||||
class(measurements_with_attrs)
|
||||
|
||||
## ----cleanup, include=FALSE---------------------------------------------------
|
||||
file.remove('measurements.json', 'measurements_bad.json')
|
||||
|
106
inst/doc/osem-serialization.Rmd
Normal file
106
inst/doc/osem-serialization.Rmd
Normal file
|
@ -0,0 +1,106 @@
|
|||
---
|
||||
title: "Caching openSenseMap Data for Reproducibility"
|
||||
author: "Norwin Roosen"
|
||||
date: "`r Sys.Date()`"
|
||||
output: rmarkdown::html_vignette
|
||||
vignette: >
|
||||
%\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility}
|
||||
%\VignetteEngine{knitr::rmarkdown}
|
||||
%\VignetteEncoding{UTF-8}
|
||||
---
|
||||
|
||||
It may be useful to download data from openSenseMap only once.
|
||||
For reproducible results, the data should be saved to disk, and reloaded at a
|
||||
later point.
|
||||
|
||||
This avoids:
|
||||
|
||||
- changed results for queries without date parameters,
|
||||
- unnecessary wait times,
|
||||
- risk of API changes / API unavailability,
|
||||
- stress on the openSenseMap-server.
|
||||
|
||||
This vignette shows how to use this built-in `opensensmapr` feature, and
|
||||
how to do it yourself in case you want to save to other data formats.
|
||||
|
||||
```{r setup, results='hide'}
|
||||
# this vignette requires:
|
||||
library(opensensmapr)
|
||||
library(jsonlite)
|
||||
library(readr)
|
||||
```
|
||||
|
||||
## Using the opensensmapr Caching Feature
|
||||
All data retrieval functions of `opensensmapr` have a built-in caching feature,
|
||||
which serializes an API response to disk.
|
||||
Subsequent identical requests will then return the serialized data instead of making
|
||||
another request.
|
||||
|
||||
To use this feature, just add a path to a directory to the `cache` parameter:
|
||||
```{r cache}
|
||||
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
|
||||
|
||||
# the next identical request will hit the cache only!
|
||||
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
|
||||
|
||||
# requests without the cache parameter will still be performed normally
|
||||
b = osem_boxes(grouptag = 'ifgi')
|
||||
```
|
||||
|
||||
Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL:
|
||||
```{r cachelisting}
|
||||
list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
|
||||
```
|
||||
|
||||
You can maintain multiple caches simultaneously, which allows you to store only the data related to a script in the same directory:
|
||||
```{r cache_custom}
|
||||
cacheDir = getwd() # current working directory
|
||||
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
|
||||
|
||||
# the next identical request will hit the cache only!
|
||||
b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
|
||||
```
|
||||
|
||||
To get fresh results again, just call `osem_clear_cache()` for the respective cache:
|
||||
```{r clearcache, results='hide'}
|
||||
osem_clear_cache() # clears default cache
|
||||
osem_clear_cache(getwd()) # clears a custom cache
|
||||
```
|
||||
|
||||
## Custom (De-) Serialization
|
||||
If you want to roll your own serialization method to support custom data formats,
|
||||
here's how:
|
||||
|
||||
```{r data, results='hide'}
|
||||
# first get our example data:
|
||||
measurements = osem_measurements('Windgeschwindigkeit')
|
||||
```
|
||||
|
||||
If you are paranoid and worry about `.rds` files not being decodable anymore
|
||||
in the (distant) future, you could serialize to a plain text format such as JSON.
|
||||
This of course comes at the cost of storage space and performance.
|
||||
```{r serialize_json}
|
||||
# serializing senseBoxes to JSON, and loading from file again:
|
||||
write(jsonlite::serializeJSON(measurements), 'measurements.json')
|
||||
measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
|
||||
class(measurements_from_file)
|
||||
```
|
||||
|
||||
This method also persists the R object metadata (classes, attributes).
|
||||
If you were to use a serialization method that can't persist object metadata, you
|
||||
could re-apply it with the following functions:
|
||||
|
||||
```{r serialize_attrs}
|
||||
# note the toJSON call instead of serializeJSON
|
||||
write(jsonlite::toJSON(measurements), 'measurements_bad.json')
|
||||
measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
|
||||
class(measurements_without_attrs)
|
||||
|
||||
measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
|
||||
class(measurements_with_attrs)
|
||||
```
|
||||
The same goes for boxes via `osem_as_sensebox()`.
|
||||
|
||||
```{r cleanup, include=FALSE}
|
||||
file.remove('measurements.json', 'measurements_bad.json')
|
||||
```
|
444
inst/doc/osem-serialization.html
Normal file
444
inst/doc/osem-serialization.html
Normal file
|
@ -0,0 +1,444 @@
|
|||
<!DOCTYPE html>
|
||||
|
||||
<html>
|
||||
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8" />
|
||||
<meta name="generator" content="pandoc" />
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
|
||||
<meta name="author" content="Norwin Roosen" />
|
||||
|
||||
<meta name="date" content="2023-02-23" />
|
||||
|
||||
<title>Caching openSenseMap Data for Reproducibility</title>
|
||||
|
||||
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
|
||||
// be compatible with the behavior of Pandoc < 2.8).
|
||||
document.addEventListener('DOMContentLoaded', function(e) {
|
||||
var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
|
||||
var i, h, a;
|
||||
for (i = 0; i < hs.length; i++) {
|
||||
h = hs[i];
|
||||
if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6
|
||||
a = h.attributes;
|
||||
while (a.length > 0) h.removeAttribute(a[0].name);
|
||||
}
|
||||
});
|
||||
</script>
|
||||
|
||||
<style type="text/css">
|
||||
code{white-space: pre-wrap;}
|
||||
span.smallcaps{font-variant: small-caps;}
|
||||
span.underline{text-decoration: underline;}
|
||||
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
||||
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
||||
ul.task-list{list-style: none;}
|
||||
</style>
|
||||
|
||||
|
||||
|
||||
<style type="text/css">
|
||||
code {
|
||||
white-space: pre;
|
||||
}
|
||||
.sourceCode {
|
||||
overflow: visible;
|
||||
}
|
||||
</style>
|
||||
<style type="text/css" data-origin="pandoc">
|
||||
pre > code.sourceCode { white-space: pre; position: relative; }
|
||||
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
|
||||
pre > code.sourceCode > span:empty { height: 1.2em; }
|
||||
.sourceCode { overflow: visible; }
|
||||
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
||||
div.sourceCode { margin: 1em 0; }
|
||||
pre.sourceCode { margin: 0; }
|
||||
@media screen {
|
||||
div.sourceCode { overflow: auto; }
|
||||
}
|
||||
@media print {
|
||||
pre > code.sourceCode { white-space: pre-wrap; }
|
||||
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
|
||||
}
|
||||
pre.numberSource code
|
||||
{ counter-reset: source-line 0; }
|
||||
pre.numberSource code > span
|
||||
{ position: relative; left: -4em; counter-increment: source-line; }
|
||||
pre.numberSource code > span > a:first-child::before
|
||||
{ content: counter(source-line);
|
||||
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
||||
border: none; display: inline-block;
|
||||
-webkit-touch-callout: none; -webkit-user-select: none;
|
||||
-khtml-user-select: none; -moz-user-select: none;
|
||||
-ms-user-select: none; user-select: none;
|
||||
padding: 0 4px; width: 4em;
|
||||
color: #aaaaaa;
|
||||
}
|
||||
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
|
||||
div.sourceCode
|
||||
{ }
|
||||
@media screen {
|
||||
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
||||
}
|
||||
code span.al { color: #ff0000; font-weight: bold; }
|
||||
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
|
||||
code span.at { color: #7d9029; }
|
||||
code span.bn { color: #40a070; }
|
||||
code span.bu { color: #008000; }
|
||||
code span.cf { color: #007020; font-weight: bold; }
|
||||
code span.ch { color: #4070a0; }
|
||||
code span.cn { color: #880000; }
|
||||
code span.co { color: #60a0b0; font-style: italic; }
|
||||
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
|
||||
code span.do { color: #ba2121; font-style: italic; }
|
||||
code span.dt { color: #902000; }
|
||||
code span.dv { color: #40a070; }
|
||||
code span.er { color: #ff0000; font-weight: bold; }
|
||||
code span.ex { }
|
||||
code span.fl { color: #40a070; }
|
||||
code span.fu { color: #06287e; }
|
||||
code span.im { color: #008000; font-weight: bold; }
|
||||
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
|
||||
code span.kw { color: #007020; font-weight: bold; }
|
||||
code span.op { color: #666666; }
|
||||
code span.ot { color: #007020; }
|
||||
code span.pp { color: #bc7a00; }
|
||||
code span.sc { color: #4070a0; }
|
||||
code span.ss { color: #bb6688; }
|
||||
code span.st { color: #4070a0; }
|
||||
code span.va { color: #19177c; }
|
||||
code span.vs { color: #4070a0; }
|
||||
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
|
||||
</style>
|
||||
<script>
|
||||
// apply pandoc div.sourceCode style to pre.sourceCode instead
|
||||
(function() {
|
||||
var sheets = document.styleSheets;
|
||||
for (var i = 0; i < sheets.length; i++) {
|
||||
if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
|
||||
try { var rules = sheets[i].cssRules; } catch (e) { continue; }
|
||||
var j = 0;
|
||||
while (j < rules.length) {
|
||||
var rule = rules[j];
|
||||
// check if there is a div.sourceCode rule
|
||||
if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
|
||||
j++;
|
||||
continue;
|
||||
}
|
||||
var style = rule.style.cssText;
|
||||
// check if color or background-color is set
|
||||
if (rule.style.color === '' && rule.style.backgroundColor === '') {
|
||||
j++;
|
||||
continue;
|
||||
}
|
||||
// replace div.sourceCode by a pre.sourceCode rule
|
||||
sheets[i].deleteRule(j);
|
||||
sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
|
||||
}
|
||||
}
|
||||
})();
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
<style type="text/css">body {
|
||||
background-color: #fff;
|
||||
margin: 1em auto;
|
||||
max-width: 700px;
|
||||
overflow: visible;
|
||||
padding-left: 2em;
|
||||
padding-right: 2em;
|
||||
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
|
||||
font-size: 14px;
|
||||
line-height: 1.35;
|
||||
}
|
||||
#TOC {
|
||||
clear: both;
|
||||
margin: 0 0 10px 10px;
|
||||
padding: 4px;
|
||||
width: 400px;
|
||||
border: 1px solid #CCCCCC;
|
||||
border-radius: 5px;
|
||||
background-color: #f6f6f6;
|
||||
font-size: 13px;
|
||||
line-height: 1.3;
|
||||
}
|
||||
#TOC .toctitle {
|
||||
font-weight: bold;
|
||||
font-size: 15px;
|
||||
margin-left: 5px;
|
||||
}
|
||||
#TOC ul {
|
||||
padding-left: 40px;
|
||||
margin-left: -1.5em;
|
||||
margin-top: 5px;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
#TOC ul ul {
|
||||
margin-left: -2em;
|
||||
}
|
||||
#TOC li {
|
||||
line-height: 16px;
|
||||
}
|
||||
table {
|
||||
margin: 1em auto;
|
||||
border-width: 1px;
|
||||
border-color: #DDDDDD;
|
||||
border-style: outset;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
table th {
|
||||
border-width: 2px;
|
||||
padding: 5px;
|
||||
border-style: inset;
|
||||
}
|
||||
table td {
|
||||
border-width: 1px;
|
||||
border-style: inset;
|
||||
line-height: 18px;
|
||||
padding: 5px 5px;
|
||||
}
|
||||
table, table th, table td {
|
||||
border-left-style: none;
|
||||
border-right-style: none;
|
||||
}
|
||||
table thead, table tr.even {
|
||||
background-color: #f7f7f7;
|
||||
}
|
||||
p {
|
||||
margin: 0.5em 0;
|
||||
}
|
||||
blockquote {
|
||||
background-color: #f6f6f6;
|
||||
padding: 0.25em 0.75em;
|
||||
}
|
||||
hr {
|
||||
border-style: solid;
|
||||
border: none;
|
||||
border-top: 1px solid #777;
|
||||
margin: 28px 0;
|
||||
}
|
||||
dl {
|
||||
margin-left: 0;
|
||||
}
|
||||
dl dd {
|
||||
margin-bottom: 13px;
|
||||
margin-left: 13px;
|
||||
}
|
||||
dl dt {
|
||||
font-weight: bold;
|
||||
}
|
||||
ul {
|
||||
margin-top: 0;
|
||||
}
|
||||
ul li {
|
||||
list-style: circle outside;
|
||||
}
|
||||
ul ul {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
pre, code {
|
||||
background-color: #f7f7f7;
|
||||
border-radius: 3px;
|
||||
color: #333;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
pre {
|
||||
border-radius: 3px;
|
||||
margin: 5px 0px 10px 0px;
|
||||
padding: 10px;
|
||||
}
|
||||
pre:not([class]) {
|
||||
background-color: #f7f7f7;
|
||||
}
|
||||
code {
|
||||
font-family: Consolas, Monaco, 'Courier New', monospace;
|
||||
font-size: 85%;
|
||||
}
|
||||
p > code, li > code {
|
||||
padding: 2px 0px;
|
||||
}
|
||||
div.figure {
|
||||
text-align: center;
|
||||
}
|
||||
img {
|
||||
background-color: #FFFFFF;
|
||||
padding: 2px;
|
||||
border: 1px solid #DDDDDD;
|
||||
border-radius: 3px;
|
||||
border: 1px solid #CCCCCC;
|
||||
margin: 0 5px;
|
||||
}
|
||||
h1 {
|
||||
margin-top: 0;
|
||||
font-size: 35px;
|
||||
line-height: 40px;
|
||||
}
|
||||
h2 {
|
||||
border-bottom: 4px solid #f7f7f7;
|
||||
padding-top: 10px;
|
||||
padding-bottom: 2px;
|
||||
font-size: 145%;
|
||||
}
|
||||
h3 {
|
||||
border-bottom: 2px solid #f7f7f7;
|
||||
padding-top: 10px;
|
||||
font-size: 120%;
|
||||
}
|
||||
h4 {
|
||||
border-bottom: 1px solid #f7f7f7;
|
||||
margin-left: 8px;
|
||||
font-size: 105%;
|
||||
}
|
||||
h5, h6 {
|
||||
border-bottom: 1px solid #ccc;
|
||||
font-size: 105%;
|
||||
}
|
||||
a {
|
||||
color: #0033dd;
|
||||
text-decoration: none;
|
||||
}
|
||||
a:hover {
|
||||
color: #6666ff; }
|
||||
a:visited {
|
||||
color: #800080; }
|
||||
a:visited:hover {
|
||||
color: #BB00BB; }
|
||||
a[href^="http:"] {
|
||||
text-decoration: underline; }
|
||||
a[href^="https:"] {
|
||||
text-decoration: underline; }
|
||||
|
||||
code > span.kw { color: #555; font-weight: bold; }
|
||||
code > span.dt { color: #902000; }
|
||||
code > span.dv { color: #40a070; }
|
||||
code > span.bn { color: #d14; }
|
||||
code > span.fl { color: #d14; }
|
||||
code > span.ch { color: #d14; }
|
||||
code > span.st { color: #d14; }
|
||||
code > span.co { color: #888888; font-style: italic; }
|
||||
code > span.ot { color: #007020; }
|
||||
code > span.al { color: #ff0000; font-weight: bold; }
|
||||
code > span.fu { color: #900; font-weight: bold; }
|
||||
code > span.er { color: #a61717; background-color: #e3d2d2; }
|
||||
</style>
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 class="title toc-ignore">Caching openSenseMap Data for
|
||||
Reproducibility</h1>
|
||||
<h4 class="author">Norwin Roosen</h4>
|
||||
<h4 class="date">2023-02-23</h4>
|
||||
|
||||
|
||||
|
||||
<p>It may be useful to download data from openSenseMap only once. For
|
||||
reproducible results, the data should be saved to disk, and reloaded at
|
||||
a later point.</p>
|
||||
<p>This avoids..</p>
|
||||
<ul>
|
||||
<li>changed results for queries without date parameters,</li>
|
||||
<li>unnecessary wait times,</li>
|
||||
<li>risk of API changes / API unavailability,</li>
|
||||
<li>stress on the openSenseMap-server.</li>
|
||||
</ul>
|
||||
<p>This vignette shows how to use this built in
|
||||
<code>opensensmapr</code> feature, and how to do it yourself in case you
|
||||
want to save to other data formats.</p>
|
||||
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># this vignette requires:</span></span>
|
||||
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(opensensmapr)</span>
|
||||
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(jsonlite)</span>
|
||||
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(readr)</span></code></pre></div>
|
||||
<div id="using-the-opensensmapr-caching-feature" class="section level2">
|
||||
<h2>Using the opensensmapr Caching Feature</h2>
|
||||
<p>All data retrieval functions of <code>opensensmapr</code> have a
|
||||
built-in caching feature, which serializes an API response to disk.
|
||||
Subsequent identical requests will then return the serialized data
|
||||
instead of making another request.</p>
|
||||
<p>To use this feature, just add a path to a directory to the
|
||||
<code>cache</code> parameter:</p>
|
||||
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">'ifgi'</span>, <span class="at">cache =</span> <span class="fu">tempdir</span>())</span>
|
||||
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co"># the next identical request will hit the cache only!</span></span>
|
||||
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">'ifgi'</span>, <span class="at">cache =</span> <span class="fu">tempdir</span>())</span>
|
||||
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="co"># requests without the cache parameter will still be performed normally</span></span>
|
||||
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">'ifgi'</span>)</span></code></pre></div>
|
||||
<p>Looking at the cache directory we can see one file for each request,
|
||||
which is identified through a hash of the request URL:</p>
|
||||
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">list.files</span>(<span class="fu">tempdir</span>(), <span class="at">pattern =</span> <span class="st">'osemcache</span><span class="sc">\\</span><span class="st">..*</span><span class="sc">\\</span><span class="st">.rds'</span>)</span></code></pre></div>
|
||||
<pre><code>## [1] "osemcache.17db5c57fc6fca4d836fa2cf30345ce8767cd61a.rds"</code></pre>
|
||||
<p>You can maintain multiple caches simultaneously, which allows you to only
|
||||
store data related to a script in the same directory:</p>
|
||||
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>cacheDir <span class="ot">=</span> <span class="fu">getwd</span>() <span class="co"># current working directory</span></span>
|
||||
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">'ifgi'</span>, <span class="at">cache =</span> cacheDir)</span>
|
||||
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="co"># the next identical request will hit the cache only!</span></span>
|
||||
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a>b <span class="ot">=</span> <span class="fu">osem_boxes</span>(<span class="at">grouptag =</span> <span class="st">'ifgi'</span>, <span class="at">cache =</span> cacheDir)</span></code></pre></div>
|
||||
<p>To get fresh results again, just call <code>osem_clear_cache()</code>
|
||||
for the respective cache:</p>
|
||||
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">osem_clear_cache</span>() <span class="co"># clears default cache</span></span>
|
||||
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="fu">osem_clear_cache</span>(<span class="fu">getwd</span>()) <span class="co"># clears a custom cache</span></span></code></pre></div>
|
||||
</div>
|
||||
<div id="custom-de--serialization" class="section level2">
|
||||
<h2>Custom (De-) Serialization</h2>
|
||||
<p>If you want to roll your own serialization method to support custom
|
||||
data formats, here’s how:</p>
|
||||
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="co"># first get our example data:</span></span>
|
||||
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a>measurements <span class="ot">=</span> <span class="fu">osem_measurements</span>(<span class="st">'Windgeschwindigkeit'</span>)</span></code></pre></div>
|
||||
<p>If you are paranoid and worry about <code>.rds</code> files not being
|
||||
decodable anymore in the (distant) future, you could serialize to a
|
||||
plain text format such as JSON. This of course comes at the cost of
|
||||
storage space and performance.</p>
|
||||
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="co"># serializing measurements to JSON, and loading from file again:</span></span>
|
||||
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="fu">write</span>(jsonlite<span class="sc">::</span><span class="fu">serializeJSON</span>(measurements), <span class="st">'measurements.json'</span>)</span>
|
||||
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>measurements_from_file <span class="ot">=</span> jsonlite<span class="sc">::</span><span class="fu">unserializeJSON</span>(readr<span class="sc">::</span><span class="fu">read_file</span>(<span class="st">'measurements.json'</span>))</span>
|
||||
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(measurements_from_file)</span></code></pre></div>
|
||||
<pre><code>## [1] "osem_measurements" "tbl_df" "tbl"
|
||||
## [4] "data.frame"</code></pre>
|
||||
<p>This method also persists the R object metadata (classes,
|
||||
attributes). If you were to use a serialization method that can’t
|
||||
persist object metadata, you could re-apply it with the following
|
||||
functions:</p>
|
||||
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="co"># note the toJSON call instead of serializeJSON</span></span>
|
||||
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="fu">write</span>(jsonlite<span class="sc">::</span><span class="fu">toJSON</span>(measurements), <span class="st">'measurements_bad.json'</span>)</span>
|
||||
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a>measurements_without_attrs <span class="ot">=</span> jsonlite<span class="sc">::</span><span class="fu">fromJSON</span>(<span class="st">'measurements_bad.json'</span>)</span>
|
||||
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(measurements_without_attrs)</span></code></pre></div>
|
||||
<pre><code>## [1] "data.frame"</code></pre>
|
||||
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>measurements_with_attrs <span class="ot">=</span> <span class="fu">osem_as_measurements</span>(measurements_without_attrs)</span>
|
||||
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(measurements_with_attrs)</span></code></pre></div>
|
||||
<pre><code>## [1] "osem_measurements" "tbl_df" "tbl"
|
||||
## [4] "data.frame"</code></pre>
|
||||
<p>The same goes for boxes via <code>osem_as_sensebox()</code>.</p>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<!-- code folding -->
|
||||
|
||||
|
||||
<!-- dynamically load mathjax for compatibility with self-contained -->
|
||||
<script>
|
||||
(function () {
|
||||
var script = document.createElement("script");
|
||||
script.type = "text/javascript";
|
||||
script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
|
||||
document.getElementsByTagName("head")[0].appendChild(script);
|
||||
})();
|
||||
</script>
|
||||
|
||||
</body>
|
||||
</html>
|
Loading…
Add table
Reference in a new issue