diff --git a/inst/doc/osem-history.R b/inst/doc/osem-history.R deleted file mode 100644 index 09048eb..0000000 --- a/inst/doc/osem-history.R +++ /dev/null @@ -1,133 +0,0 @@ -## ----setup, results='hide', message=FALSE, warning=FALSE----------------- -# required packages: -library(opensensmapr) # data download -library(dplyr) # data wrangling -library(ggplot2) # plotting -library(lubridate) # date arithmetic -library(zoo) # rollmean() - -## ----download------------------------------------------------------------ -# if you want to see results for a specific subset of boxes, -# just specify a filter such as grouptag='ifgi' here -boxes = osem_boxes() - -## ----exposure_counts, message=FALSE-------------------------------------- -exposure_counts = boxes %>% - group_by(exposure) %>% - mutate(count = row_number(createdAt)) - -exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') -ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) + - geom_line() + - scale_colour_manual(values = exposure_colors) + - xlab('Registration Date') + ylab('senseBox count') - -## ----exposure_summary---------------------------------------------------- -exposure_counts %>% - summarise( - oldest = min(createdAt), - newest = max(createdAt), - count = max(count) - ) %>% - arrange(desc(count)) - -## ----grouptag_counts, message=FALSE-------------------------------------- -grouptag_counts = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 8 or more members - filter(length(grouptag) >= 8 && !is.na(grouptag)) %>% - mutate(count = row_number(createdAt)) - -# helper for sorting the grouptags by boxcount -sortLvls = function(oldFactor, ascending = TRUE) { - lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() - factor(oldFactor, levels = lvls) -} -grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) - -ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) + - geom_line(aes(group = grouptag)) + - xlab('Registration Date') + ylab('senseBox count') - -## ----grouptag_summary---------------------------------------------------- -grouptag_counts %>% - summarise( - oldest = min(createdAt), - newest = max(createdAt), - count = max(count) - ) %>% - arrange(desc(count)) - -## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'---- -bins = 'week' -mvavg_bins = 6 - -growth = boxes %>% - mutate(week = cut(as.Date(createdAt), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'registered') - -## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'---- -inactive = boxes %>% - # remove boxes that were updated in the last two days, - # b/c any box becomes inactive at some point by definition of updatedAt - filter(updatedAt < now() - days(2)) %>% - mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'inactive') - -## ----growthrate, warning=FALSE, message=FALSE, results='hide'------------ -boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) - -ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + - xlab('Time') + ylab(paste('rate per ', bins)) + - scale_x_date(date_breaks="years", date_labels="%Y") + - scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + - geom_point(aes(y = count), size = 0.5) + - # moving average, make first and last value NA (to ensure identical length of vectors) - geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) - -## ----exposure_duration, message=FALSE------------------------------------ -duration = boxes %>% - group_by(exposure) %>% - filter(!is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = exposure, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') - -## ----grouptag_duration, message=FALSE------------------------------------ -duration = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 8 or more members - filter(length(grouptag) >= 8 && !is.na(grouptag) && !is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = grouptag, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') - -duration %>% - summarize( - duration_avg = round(mean(duration)), - duration_min = round(min(duration)), - duration_max = round(max(duration)), - oldest_box = round(max(difftime(now(), createdAt, units='days'))) - ) %>% - arrange(desc(duration_avg)) - -## ----year_duration, message=FALSE---------------------------------------- -# NOTE: boxes older than 2016 missing due to missing updatedAt in database -duration = boxes %>% - mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>% - group_by(year) %>% - filter(!is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') - diff --git a/inst/doc/osem-history.Rmd b/inst/doc/osem-history.Rmd deleted file mode 100644 index 12aff6a..0000000 --- a/inst/doc/osem-history.Rmd +++ /dev/null @@ -1,243 +0,0 @@ ---- -title: "Visualising the History of openSenseMap.org" -author: "Norwin Roosen" -date: '`r Sys.Date()`' -output: - rmarkdown::html_vignette: - df_print: kable - fig_height: 5 - fig_width: 7 - toc: yes - html_document: - code_folding: hide - df_print: kable - theme: lumen - toc: yes - toc_float: yes -vignette: > - %\VignetteIndexEntry{Visualising the History of openSenseMap.org} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -> This vignette serves as an example on data wrangling & visualization with -`opensensmapr`, `dplyr` and `ggplot2`. - -```{r setup, results='hide', message=FALSE, warning=FALSE} -# required packages: -library(opensensmapr) # data download -library(dplyr) # data wrangling -library(ggplot2) # plotting -library(lubridate) # date arithmetic -library(zoo) # rollmean() -``` - -openSenseMap.org has grown quite a bit in the last years; it would be interesting -to see how we got to the current `r osem_counts()$boxes` sensor stations, -split up by various attributes of the boxes. - -While `opensensmapr` provides extensive methods of filtering boxes by attributes -on the server, we do the filtering within R to save time and gain flexibility. -So the first step is to retrieve *all the boxes*: - -```{r download} -# if you want to see results for a specific subset of boxes, -# just specify a filter such as grouptag='ifgi' here -boxes = osem_boxes() -``` - -# Plot count of boxes by time {.tabset} -By looking at the `createdAt` attribute of each box we know the exact time a box -was registered. -With this approach we have no information about boxes that were deleted in the -meantime, but that's okay for now. - -## ...and exposure -```{r exposure_counts, message=FALSE} -exposure_counts = boxes %>% - group_by(exposure) %>% - mutate(count = row_number(createdAt)) - -exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') -ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) + - geom_line() + - scale_colour_manual(values = exposure_colors) + - xlab('Registration Date') + ylab('senseBox count') -``` - -Outdoor boxes are growing *fast*! -We can also see the introduction of `mobile` sensor "stations" in 2017. While -mobile boxes are still few, we can expect a quick rise in 2018 once the new -[senseBox MCU with GPS support is released](https://sensebox.de/blog/2018-03-06-senseBox_MCU). - -Let's have a quick summary: -```{r exposure_summary} -exposure_counts %>% - summarise( - oldest = min(createdAt), - newest = max(createdAt), - count = max(count) - ) %>% - arrange(desc(count)) -``` - -## ...and grouptag -We can try to find out where the increases in growth came from, by analysing the -box count by grouptag. - -Caveats: Only a small subset of boxes has a grouptag, and we should assume -that these groups are actually bigger. Also, we can see that grouptag naming is -inconsistent (`Luftdaten`, `luftdaten.info`, ...) - -```{r grouptag_counts, message=FALSE} -grouptag_counts = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 8 or more members - filter(length(grouptag) >= 8 && !is.na(grouptag)) %>% - mutate(count = row_number(createdAt)) - -# helper for sorting the grouptags by boxcount -sortLvls = function(oldFactor, ascending = TRUE) { - lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() - factor(oldFactor, levels = lvls) -} -grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) - -ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) + - geom_line(aes(group = grouptag)) + - xlab('Registration Date') + ylab('senseBox count') -``` - -```{r grouptag_summary} -grouptag_counts %>% - summarise( - oldest = min(createdAt), - newest = max(createdAt), - count = max(count) - ) %>% - arrange(desc(count)) -``` - -# Plot rate of growth and inactivity per week -First we group the boxes by `createdAt` into bins of one week: -```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'} -bins = 'week' -mvavg_bins = 6 - -growth = boxes %>% - mutate(week = cut(as.Date(createdAt), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'registered') -``` - -We can do the same for `updatedAt`, which informs us about the last change to -a box, including uploaded measurements. -This method of determining inactive boxes is fairly inaccurate and should be -considered an approximation, because we have no information about intermediate -inactive phases. -Also deleted boxes would probably have a big impact here. -```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'} -inactive = boxes %>% - # remove boxes that were updated in the last two days, - # b/c any box becomes inactive at some point by definition of updatedAt - filter(updatedAt < now() - days(2)) %>% - mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'inactive') -``` - -Now we can combine both datasets for plotting: -```{r growthrate, warning=FALSE, message=FALSE, results='hide'} -boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) - -ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + - xlab('Time') + ylab(paste('rate per ', bins)) + - scale_x_date(date_breaks="years", date_labels="%Y") + - scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + - geom_point(aes(y = count), size = 0.5) + - # moving average, make first and last value NA (to ensure identical length of vectors) - geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) -``` - -We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`. -This was enabled by an integration of openSenseMap.org into the firmware of the -air quality monitoring project [luftdaten.info](https://luftdaten.info). -The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues -of the senseBox hardware, but I have no data on the exact time frames to verify. - -# Plot duration of boxes being active {.tabset} -While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity -of each box, and look at metrics by exposure and grouptag once more: - -## ...by exposure -```{r exposure_duration, message=FALSE} -duration = boxes %>% - group_by(exposure) %>% - filter(!is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = exposure, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') -``` - -The time of activity averages at only `r round(mean(duration$duration))` days, -though there are boxes with `r round(max(duration$duration))` days of activity, -spanning a large chunk of openSenseMap's existence. - -## ...by grouptag -```{r grouptag_duration, message=FALSE} -duration = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 8 or more members - filter(length(grouptag) >= 8 && !is.na(grouptag) && !is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = grouptag, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') - -duration %>% - summarize( - duration_avg = round(mean(duration)), - duration_min = round(min(duration)), - duration_max = round(max(duration)), - oldest_box = round(max(difftime(now(), createdAt, units='days'))) - ) %>% - arrange(desc(duration_avg)) -``` - -The time of activity averages at only `r round(mean(duration$duration))` days, -though there are boxes with `r round(max(duration$duration))` days of activity, -spanning a large chunk of openSenseMap's existence. - -## ...by year of registration -This is less useful, as older boxes are active for a longer time by definition. -If you have an idea how to compensate for that, please send a [Pull Request][PR]! - -```{r year_duration, message=FALSE} -# NOTE: boxes older than 2016 missing due to missing updatedAt in database -duration = boxes %>% - mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>% - group_by(year) %>% - filter(!is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') -``` - -# More Visualisations -Other visualisations come to mind, and are left as an exercise to the reader. -If you implemented some, feel free to add them to this vignette via a [Pull Request][PR]. - -* growth by phenomenon -* growth by location -> (interactive) map -* set inactive rate in relation to total box count -* filter timespans with big dips in growth rate, and extrapolate the amount of - senseBoxes that could be on the platform today, assuming there were no production issues ;) - -[PR]: https://github.com/sensebox/opensensmapr/pulls diff --git a/inst/doc/osem-history.html b/inst/doc/osem-history.html deleted file mode 100644 index af507d9..0000000 --- a/inst/doc/osem-history.html +++ /dev/null @@ -1,501 +0,0 @@ - - - - -
- - - - - - - - - - - ---This vignette serves as an example on data wrangling & visualization with
-opensensmapr
,dplyr
andggplot2
.
# required packages:
-library(opensensmapr) # data download
-library(dplyr) # data wrangling
-library(ggplot2) # plotting
-library(lubridate) # date arithmetic
-library(zoo) # rollmean()
openSenseMap.org has grown quite a bit in the last years; it would be interesting to see how we got to the current 1781 sensor stations, split up by various attributes of the boxes.
-While opensensmapr
provides extensive methods of filtering boxes by attributes on the server, we do the filtering within R to save time and gain flexibility. So the first step is to retrieve all the boxes:
# if you want to see results for a specific subset of boxes,
-# just specify a filter such as grouptag='ifgi' here
-boxes = osem_boxes()
By looking at the createdAt
attribute of each box we know the exact time a box was registered. With this approach we have no information about boxes that were deleted in the meantime, but that’s okay for now.
exposure_counts = boxes %>%
- group_by(exposure) %>%
- mutate(count = row_number(createdAt))
-
-exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
-ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
- geom_line() +
- scale_colour_manual(values = exposure_colors) +
- xlab('Registration Date') + ylab('senseBox count')
Outdoor boxes are growing fast! We can also see the introduction of mobile
sensor “stations” in 2017. While mobile boxes are still few, we can expect a quick rise in 2018 once the new senseBox MCU with GPS support is released.
Let’s have a quick summary:
-exposure_counts %>%
- summarise(
- oldest = min(createdAt),
- newest = max(createdAt),
- count = max(count)
- ) %>%
- arrange(desc(count))
exposure | -oldest | -newest | -count | -
---|---|---|---|
outdoor | -2015-02-18 16:53:41 | -2018-05-26 08:39:12 | -1416 | -
indoor | -2015-02-08 17:36:40 | -2018-05-26 10:29:27 | -290 | -
mobile | -2017-05-24 08:16:36 | -2018-05-24 07:08:32 | -55 | -
unknown | -2014-05-28 15:36:14 | -2016-06-25 15:11:11 | -20 | -
We can try to find out where the increases in growth came from, by analysing the box count by grouptag.
-Caveats: Only a small subset of boxes has a grouptag, and we should assume that these groups are actually bigger. Also, we can see that grouptag naming is inconsistent (Luftdaten
, luftdaten.info
, …)
grouptag_counts = boxes %>%
- group_by(grouptag) %>%
- # only include grouptags with 8 or more members
- filter(length(grouptag) >= 8 && !is.na(grouptag)) %>%
- mutate(count = row_number(createdAt))
-
-# helper for sorting the grouptags by boxcount
-sortLvls = function(oldFactor, ascending = TRUE) {
- lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
- factor(oldFactor, levels = lvls)
-}
-grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
-
-ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
- geom_line(aes(group = grouptag)) +
- xlab('Registration Date') + ylab('senseBox count')
grouptag_counts %>%
- summarise(
- oldest = min(createdAt),
- newest = max(createdAt),
- count = max(count)
- ) %>%
- arrange(desc(count))
grouptag | -oldest | -newest | -count | -
---|---|---|---|
Luftdaten | -2017-03-14 17:01:16 | -2018-05-21 02:20:50 | -109 | -
ifgi | -2016-06-17 08:04:54 | -2018-05-15 10:27:02 | -35 | -
MakeLight | -2015-02-18 16:53:41 | -2018-02-02 13:50:21 | -15 | -
Bad_Hersfeld | -2017-07-18 13:32:03 | -2018-03-22 09:10:07 | -13 | -
luftdaten.info | -2017-05-01 10:15:44 | -2018-05-17 11:47:21 | -12 | -
dwih-sp | -2016-08-09 08:06:02 | -2016-11-23 10:16:04 | -11 | -
Che Aria Tira? | -2018-03-11 10:50:42 | -2018-03-11 23:11:20 | -10 | -
Luftdaten.info | -2017-04-03 14:10:20 | -2018-04-16 16:31:24 | -10 | -
Feinstaub | -2017-04-08 06:38:25 | -2018-03-29 17:27:55 | -9 | -
PGKN | -2018-04-08 07:01:57 | -2018-04-27 18:38:51 | -9 | -
Raumanmeri | -2017-03-13 11:35:39 | -2017-04-27 05:36:20 | -9 | -
Sofia | -2017-04-11 04:40:11 | -2018-03-15 13:26:56 | -9 | -
IKG | -2017-03-21 19:02:11 | -2017-12-18 14:30:21 | -8 | -
First we group the boxes by createdAt
into bins of one week:
bins = 'week'
-mvavg_bins = 6
-
-growth = boxes %>%
- mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
- group_by(week) %>%
- summarize(count = length(week)) %>%
- mutate(event = 'registered')
We can do the same for updatedAt
, which informs us about the last change to a box, including uploaded measurements. This method of determining inactive boxes is fairly inaccurate and should be considered an approximation, because we have no information about intermediate inactive phases. Also deleted boxes would probably have a big impact here.
inactive = boxes %>%
- # remove boxes that were updated in the last two days,
- # b/c any box becomes inactive at some point by definition of updatedAt
- filter(updatedAt < now() - days(2)) %>%
- mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
- group_by(week) %>%
- summarize(count = length(week)) %>%
- mutate(event = 'inactive')
Now we can combine both datasets for plotting:
-boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
-
-ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
- xlab('Time') + ylab(paste('rate per ', bins)) +
- scale_x_date(date_breaks="years", date_labels="%Y") +
- scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
- geom_point(aes(y = count), size = 0.5) +
- # moving average, make first and last value NA (to ensure identical length of vectors)
- geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
We see a sudden rise in early 2017, which lines up with the fast growing grouptag Luftdaten
. This was enabled by an integration of openSenseMap.org into the firmware of the air quality monitoring project luftdaten.info. The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues of the senseBox hardware, but I have no data on the exact time frames to verify.
While we are looking at createdAt
and updatedAt
, we can also extract the duration of activity of each box, and look at metrics by exposure and grouptag once more:
duration = boxes %>%
- group_by(exposure) %>%
- filter(!is.na(updatedAt)) %>%
- mutate(duration = difftime(updatedAt, createdAt, units='days'))
-
-ggplot(duration, aes(x = exposure, y = duration)) +
- geom_boxplot() +
- coord_flip() + ylab('Duration active in Days')
The time of activity averages at only 152 days, though there are boxes with 759 days of activity, spanning a large chunk of openSenseMap’s existence.
-duration = boxes %>%
- group_by(grouptag) %>%
- # only include grouptags with 8 or more members
- filter(length(grouptag) >= 8 && !is.na(grouptag) && !is.na(updatedAt)) %>%
- mutate(duration = difftime(updatedAt, createdAt, units='days'))
-
-ggplot(duration, aes(x = grouptag, y = duration)) +
- geom_boxplot() +
- coord_flip() + ylab('Duration active in Days')
duration %>%
- summarize(
- duration_avg = round(mean(duration)),
- duration_min = round(min(duration)),
- duration_max = round(max(duration)),
- oldest_box = round(max(difftime(now(), createdAt, units='days')))
- ) %>%
- arrange(desc(duration_avg))
grouptag | -duration_avg | -duration_min | -duration_max | -oldest_box | -
---|---|---|---|---|
dwih-sp | -627 days | -549 days | -655 days | -655 days | -
Feinstaub | -219 days | -4 days | -413 days | -413 days | -
ifgi | -207 days | -0 days | -622 days | -708 days | -
Sofia | -200 days | -15 days | -410 days | -410 days | -
Bad_Hersfeld | -197 days | -65 days | -312 days | -312 days | -
Luftdaten | -187 days | -0 days | -424 days | -438 days | -
luftdaten.info | -183 days | -9 days | -360 days | -390 days | -
IKG | -163 days | -70 days | -260 days | -431 days | -
Luftdaten.info | -86 days | -5 days | -376 days | -418 days | -
Che Aria Tira? | -75 days | -71 days | -76 days | -76 days | -
Raumanmeri | -45 days | -7 days | -318 days | -439 days | -
PGKN | -35 days | -29 days | -48 days | -48 days | -
The time of activity averages at only 191 days, though there are boxes with 655 days of activity, spanning a large chunk of openSenseMap’s existence.
-This is less useful, as older boxes are active for a longer time by definition. If you have an idea how to compensate for that, please send a Pull Request!
-# NOTE: boxes older than 2016 missing due to missing updatedAt in database
-duration = boxes %>%
- mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
- group_by(year) %>%
- filter(!is.na(updatedAt)) %>%
- mutate(duration = difftime(updatedAt, createdAt, units='days'))
-
-ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
- geom_boxplot() +
- coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
Other visualisations come to mind, and are left as an exercise to the reader. If you implemented some, feel free to add them to this vignette via a Pull Request.
---This vignette serves as an example on data wrangling & -visualization with
-opensensmapr
,dplyr
and -ggplot2
.
# required packages:
-#library(opensensmapr) # data download
-library(devtools)
-load_all(".")
-library(dplyr) # data wrangling
-library(ggplot2) # plotting
-library(lubridate) # date arithmetic
-library(zoo) # rollmean()
-openSenseMap.org has grown quite a bit in the last years; it would be -interesting to see how we got to the current 11307 sensor stations, -split up by various attributes of the boxes.
-While opensensmapr
provides extensive methods of
-filtering boxes by attributes on the server, we do the filtering within
-R to save time and gain flexibility.
So the first step is to retrieve all the boxes.
-# if you want to see results for a specific subset of boxes,
-# just specify a filter such as grouptag='ifgi' here
-boxes_all = osem_boxes()
-boxes = boxes_all
-In the following we just want to have a look at the boxes created in -2022, so we filter for them.
-boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp <="2022-12-31")
-summary(boxes) -> summary.data.frame
-## boxes total: 2132
-##
-## boxes by exposure:
-## indoor mobile outdoor unknown
-## 532 201 1398 1
-##
-## boxes by model:
-## custom hackair_home_v2 homeEthernet
-## 939 5 5
-## homeEthernetFeinstaub homeV2Ethernet homeV2EthernetFeinstaub
-## 2 5 5
-## homeV2Lora homeV2Wifi homeV2WifiFeinstaub
-## 62 226 116
-## homeWifi homeWifiFeinstaub luftdaten_pms1003
-## 14 17 0
-## luftdaten_pms1003_bme280 luftdaten_pms3003 luftdaten_pms3003_bme280
-## 1 0 0
-## luftdaten_pms5003 luftdaten_pms5003_bme280 luftdaten_pms7003
-## 2 8 0
-## luftdaten_pms7003_bme280 luftdaten_sds011 luftdaten_sds011_bme280
-## 8 29 465
-## luftdaten_sds011_bmp180 luftdaten_sds011_dht11 luftdaten_sds011_dht22
-## 27 7 189
-##
-## $last_measurement_within
-## 1h 1d 30d 365d never
-## 731 765 874 1571 522
-##
-## oldest box: 2020-02-29 23:00:31 (Kirchardt 1)
-## newest box: 2022-12-30 09:19:46 (Balkon)
-##
-## sensors per box:
-## Min. 1st Qu. Median Mean 3rd Qu. Max.
-## 1.000 3.000 5.000 4.959 6.000 29.000
-
-
-
-
-
-Another feature of interest is the spatial distribution of the boxes:
-plot()
can help us out here. This function requires a bunch
-of optional dependencies though.
if (!require('maps')) install.packages('maps')
-if (!require('maptools')) install.packages('maptools')
-if (!require('rgeos')) install.packages('rgeos')
-
-plot(boxes)
-
-But what do these sensor stations actually measure? Lets find out.
-osem_phenomena()
gives us a named list of of the counts of
-each observed phenomenon for the given set of sensor stations:
phenoms = osem_phenomena(boxes)
-str(phenoms)
-## List of 969
-## $ Temperatur : int 1543
-## $ rel. Luftfeuchte : int 1306
-## $ PM10 : int 1080
-## $ PM2.5 : int 1079
-## $ Luftdruck : int 1004
-## $ Beleuchtungsstärke : int 290
-## $ UV-Intensität : int 290
-## $ VOC : int 224
-## $ Lufttemperatur : int 208
-## $ CO₂ : int 179
-## $ Bodenfeuchte : int 173
-## $ Temperature : int 167
-## $ Lautstärke : int 136
-## $ Luftfeuchte : int 134
-## $ Humidity : int 124
-## $ atm. Luftdruck : int 114
-## $ Kalibrierungswert : int 108
-## $ CO2eq : int 107
-## $ IAQ : int 107
-## $ rel. Luftfeuchte SCD30 : int 107
-## $ Temperatur SCD30 : int 107
-## $ Pressure : int 98
-## $ Luftfeuchtigkeit : int 61
-## $ Bodentemperatur : int 58
-## $ PM01 : int 58
-## $ Windgeschwindigkeit : int 46
-## $ Feinstaub PM10 : int 33
-## $ Feinstaub PM2.5 : int 24
-## $ Batterie : int 22
-## $ Feinstaub PM1.0 : int 20
-## $ Taupunkt : int 19
-## $ Windrichtung : int 19
-## $ rel. Luftfeuchte (HECA) : int 16
-## $ Temperatur (HECA) : int 15
-## $ Temperatura : int 15
-## $ Durchschnitt Umgebungslautstärke : int 14
-## $ Helligkeit : int 14
-## $ Latitude : int 14
-## $ Longtitude : int 14
-## $ Minimum Umgebungslautstärke : int 14
-## $ RSSI : int 14
-## $ UV-Index : int 11
-## $ CO2 : int 10
-## $ PM1 : int 10
-## $ rel-. Luftfeuchte : int 10
-## $ temperature : int 10
-## $ Abstand nach links : int 9
-## $ Abstand nach rechts : int 9
-## $ Beschleunigung X-Achse : int 9
-## $ Beschleunigung Y-Achse : int 9
-## $ Beschleunigung Z-Achse : int 9
-## $ Feinstaub PM25 : int 9
-## $ gefühlte Temperatur : int 9
-## $ Luftdruck absolut : int 9
-## $ Luftdruck relativ : int 9
-## $ Regenrate : int 9
-## $ Sonnenstrahlung : int 9
-## $ Geschwindigkeit : int 8
-## $ humidity : int 8
-## $ Bodenfeuchtigkeit : int 7
-## $ Baterie : int 6
-## $ Bodenfeuchte 10cm : int 6
-## $ Bodenfeuchte 30cm : int 6
-## $ Bodentemperatur 10cm : int 6
-## $ Bodentemperatur 30cm : int 6
-## $ Lumina : int 6
-## $ Taupunktdifferenz (Spread) : int 6
-## $ Umiditate : int 6
-## $ UV : int 6
-## $ Air pressure : int 5
-## $ Battery : int 5
-## $ Höhe (barometrisch) : int 5
-## $ Pegel : int 5
-## $ PM4 : int 5
-## $ Prezenta-foc : int 5
-## $ Regenmenge : int 5
-## $ W-LAN : int 5
-## $ absolute Luftfeuchtigkeit : int 4
-## $ Bodenfeuchte 1 : int 4
-## $ Bodenfeuchte 2 : int 4
-## $ Bodentemperatur 1 : int 4
-## $ Bodentemperatur 2 : int 4
-## $ CO2 Konzentration : int 4
-## $ Gesamt Gewicht : int 4
-## $ PM 2.5 : int 4
-## $ Température : int 4
-## $ Wilgotność : int 4
-## $ Windspeed : int 4
-## $ air pressure : int 3
-## $ Battery Voltage : int 3
-## $ Befehl : int 3
-## $ Bodenfeuchte 50cm : int 3
-## $ Bodentemperatur 60cm : int 3
-## $ Ciśnienie : int 3
-## $ Feinstaub : int 3
-## $ Höhe : int 3
-## $ Humedad : int 3
-## $ Laufzeit : int 3
-## $ LuftfeuchteBME : int 3
-## [list output truncated]
-Thats quite some noise there, with many phenomena being measured by a -single sensor only, or many duplicated phenomena due to slightly -different spellings. We should clean that up, but for now let’s just -filter out the noise and find those phenomena with high sensor -numbers:
-phenoms[phenoms > 50]
-## $Temperatur
-## [1] 1543
-##
-## $`rel. Luftfeuchte`
-## [1] 1306
-##
-## $PM10
-## [1] 1080
-##
-## $PM2.5
-## [1] 1079
-##
-## $Luftdruck
-## [1] 1004
-##
-## $Beleuchtungsstärke
-## [1] 290
-##
-## $`UV-Intensität`
-## [1] 290
-##
-## $VOC
-## [1] 224
-##
-## $Lufttemperatur
-## [1] 208
-##
-## $`CO₂`
-## [1] 179
-##
-## $Bodenfeuchte
-## [1] 173
-##
-## $Temperature
-## [1] 167
-##
-## $Lautstärke
-## [1] 136
-##
-## $Luftfeuchte
-## [1] 134
-##
-## $Humidity
-## [1] 124
-##
-## $`atm. Luftdruck`
-## [1] 114
-##
-## $Kalibrierungswert
-## [1] 108
-##
-## $CO2eq
-## [1] 107
-##
-## $IAQ
-## [1] 107
-##
-## $`rel. Luftfeuchte SCD30`
-## [1] 107
-##
-## $`Temperatur SCD30`
-## [1] 107
-##
-## $Pressure
-## [1] 98
-##
-## $Luftfeuchtigkeit
-## [1] 61
-##
-## $Bodentemperatur
-## [1] 58
-##
-## $PM01
-## [1] 58
-By looking at the createdAt
attribute of each box we
-know the exact time a box was registered. Because of some database
-migration issues the createdAt
values are mostly wrong
-(~80% of boxes created 2022-03-30), so we are using the
-timestamp
attribute of the currentlocation
-which should in most cases correspond to the creation date.
With this approach we have no information about boxes that were -deleted in the meantime, but that’s okay for now.
-exposure_counts = boxes %>%
- group_by(exposure) %>%
- mutate(count = row_number(locationtimestamp))
-
-exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
-ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
- geom_line() +
- scale_colour_manual(values = exposure_colors) +
- xlab('Registration Date') + ylab('senseBox count')
-
-Outdoor boxes are growing fast! We can also see the
-introduction of mobile
sensor “stations” in 2017.
Let’s have a quick summary:
-exposure_counts %>%
- summarise(
- oldest = min(locationtimestamp),
- newest = max(locationtimestamp),
- count = max(count)
- ) %>%
- arrange(desc(count))
-exposure | -oldest | -newest | -count | -
---|---|---|---|
outdoor | -2022-01-01 11:59:16 | -2022-12-30 09:19:46 | -1398 | -
indoor | -2022-01-02 11:06:08 | -2022-12-23 20:46:45 | -532 | -
mobile | -2022-01-06 13:20:00 | -2022-12-21 21:35:16 | -201 | -
unknown | -2022-03-01 07:04:30 | -2022-03-01 07:04:30 | -1 | -
We can try to find out where the increases in growth came from, by -analysing the box count by grouptag.
-Caveats: Only a small subset of boxes has a grouptag, and we should
-assume that these groups are actually bigger. Also, we can see that
-grouptag naming is inconsistent (Luftdaten
,
-luftdaten.info
, …)
grouptag_counts = boxes %>%
- group_by(grouptag) %>%
- # only include grouptags with 20 or more members
- filter(length(grouptag) >= 15 && !is.na(grouptag) && grouptag != '') %>%
- mutate(count = row_number(locationtimestamp))
-## Warning: There were 33 warnings in `filter()`.
-## The first warning was:
-## ℹ In argument: `length(grouptag) >= 15 && !is.na(grouptag) && grouptag != ""`.
-## ℹ In group 11: `grouptag = "321heiss"`.
-## Caused by warning in `length(grouptag) >= 15 && !is.na(grouptag)`:
-## ! 'length(x) = 91 > 1' in coercion to 'logical(1)'
-## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 32 remaining warnings.
-# helper for sorting the grouptags by boxcount
-sortLvls = function(oldFactor, ascending = TRUE) {
- lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
- factor(oldFactor, levels = lvls)
-}
-grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
-
-ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
- geom_line(aes(group = grouptag)) +
- xlab('Registration Date') + ylab('senseBox count')
-
-grouptag_counts %>%
- summarise(
- oldest = min(locationtimestamp),
- newest = max(locationtimestamp),
- count = max(count)
- ) %>%
- arrange(desc(count))
-grouptag | -oldest | -newest | -count | -
---|---|---|---|
edu | -2022-01-02 11:06:08 | -2022-12-18 12:38:27 | -130 | -
HU Explorers | -2022-04-01 09:07:41 | -2022-12-14 10:11:34 | -128 | -
321heiss | -2022-07-09 01:29:37 | -2022-09-01 06:27:35 | -91 | -
Captographies | -2022-06-03 11:25:27 | -2022-11-16 13:26:39 | -58 | -
SUGUCS | -2022-11-30 15:25:32 | -2022-12-03 10:11:41 | -23 | -
SekSeeland | -2022-03-14 13:17:17 | -2022-03-22 20:23:58 | -19 | -
BurgerMeetnet | -2022-01-24 15:33:19 | -2022-05-10 21:22:35 | -16 | -
AGIN | -2022-11-28 17:33:12 | -2022-11-28 17:42:18 | -15 | -
BRGL | -2022-11-06 19:23:43 | -2022-11-06 22:08:36 | -15 | -
BRGW | -2022-11-02 10:28:52 | -2022-11-02 13:32:12 | -15 | -
Burgermeetnet | -2022-01-15 20:43:16 | -2022-02-11 17:59:05 | -15 | -
HTLJ | -2022-11-21 22:04:17 | -2022-11-21 22:05:47 | -15 | -
Mikroprojekt Mitmachklima | -2022-02-09 10:28:40 | -2022-08-23 13:14:11 | -15 | -
MSGB | -2022-11-14 09:08:57 | -2022-11-14 10:19:24 | -15 | -
MSHO | -2022-12-20 09:28:40 | -2022-12-20 10:01:38 | -15 | -
MSIN | -2022-11-21 17:02:39 | -2022-11-21 23:06:22 | -15 | -
First we group the boxes by locationtimestamp
into bins
-of one week:
bins = 'week'
-mvavg_bins = 6
-
-growth = boxes %>%
- mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
- group_by(week) %>%
- summarize(count = length(week)) %>%
- mutate(event = 'registered')
-We can do the same for updatedAt
, which informs us about
-the last change to a box, including uploaded measurements. As a lot of
-boxes were “updated” by the database migration, many of them are updated
-at 2022-03-30, so we try to use the lastMeasurement
-attribute instead of updatedAt
. This leads to fewer boxes
-but also automatically excludes boxes which were created but never made
-a measurement.
This method of determining inactive boxes is fairly inaccurate and -should be considered an approximation, because we have no information -about intermediate inactive phases. Also deleted boxes would probably -have a big impact here.
-inactive = boxes %>%
- # remove boxes that were updated in the last two days,
- # b/c any box becomes inactive at some point by definition of updatedAt
- filter(lastMeasurement < now() - days(2)) %>%
- mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
- filter(as.Date(week) > as.Date("2021-12-31")) %>%
- group_by(week) %>%
- summarize(count = length(week)) %>%
- mutate(event = 'inactive')
-Now we can combine both datasets for plotting:
-boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
-
-ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
- xlab('Time') + ylab(paste('rate per ', bins)) +
- scale_x_date(date_breaks="years", date_labels="%Y") +
- scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
- geom_point(aes(y = count), size = 0.5) +
- # moving average, make first and last value NA (to ensure identical length of vectors)
- geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
-
-And see in which weeks the most boxes become (in)active:
-boxes_by_date %>%
- filter(count > 50) %>%
- arrange(desc(count))
-week | -count | -event | -
---|---|---|
2022-11-21 | -93 | -registered | -
2022-06-06 | -77 | -registered | -
2022-08-29 | -76 | -registered | -
2022-10-31 | -72 | -registered | -
2022-11-14 | -68 | -registered | -
2022-11-28 | -66 | -registered | -
2022-08-22 | -61 | -registered | -
2022-02-28 | -57 | -registered | -
2022-12-12 | -56 | -registered | -
2022-08-29 | -56 | -inactive | -
2022-03-21 | -54 | -registered | -
2022-01-24 | -51 | -registered | -
2022-03-07 | -51 | -registered | -
While we are looking at locationtimestamp
and
-lastMeasurement
, we can also extract the duration of
-activity of each box, and look at metrics by exposure and grouptag once
-more:
durations = boxes %>%
- group_by(exposure) %>%
- filter(!is.na(lastMeasurement)) %>%
- mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
- filter(duration >= 0)
-
-ggplot(durations, aes(x = exposure, y = duration)) +
- geom_boxplot() +
- coord_flip() + ylab('Duration active in Days')
-
-The time of activity averages at only 130 days, though there are -boxes with 395 days of activity, spanning a large chunk of -openSenseMap’s existence.
-durations = boxes %>%
- filter(!is.na(lastMeasurement)) %>%
- group_by(grouptag) %>%
- # only include grouptags with 20 or more members
- filter(length(grouptag) >= 15 && !is.na(grouptag) && !is.na(lastMeasurement)) %>%
- mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
- filter(duration >= 0)
-## Warning: There were 21 warnings in `filter()`.
-## The first warning was:
-## ℹ In argument: `length(grouptag) >= 15 && !is.na(grouptag) && ...`.
-## ℹ In group 11: `grouptag = "321heiss"`.
-## Caused by warning in `length(grouptag) >= 15 && !is.na(grouptag)`:
-## ! 'length(x) = 81 > 1' in coercion to 'logical(1)'
-## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 20 remaining warnings.
-ggplot(durations, aes(x = grouptag, y = duration)) +
- geom_boxplot() +
- coord_flip() + ylab('Duration active in Days')
-
-durations %>%
- summarize(
- duration_avg = round(mean(duration)),
- duration_min = round(min(duration)),
- duration_max = round(max(duration)),
- oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
- ) %>%
- arrange(desc(duration_avg))
-grouptag | -duration_avg | -duration_min | -duration_max | -oldest_box | -
---|---|---|---|---|
Burgermeetnet | -225 days | -0 days | -381 days | -381 days | -
BurgerMeetnet | -160 days | -0 days | -372 days | -372 days | -
Captographies | -109 days | -0 days | -238 days | -240 days | -
BRGL | -85 days | -81 days | -86 days | -86 days | -
edu | -83 days | -0 days | -385 days | -389 days | -
MSGB | -72 days | -39 days | -78 days | -78 days | -
HTLJ | -65 days | -30 days | -71 days | -71 days | -
MSHO | -41 days | -36 days | -42 days | -42 days | -
HU Explorers | -28 days | -0 days | -189 days | -305 days | -
321heiss | -0 days | -0 days | -0 days | -207 days | -
The time of activity averages at only 61 days, though there are boxes -with 385 days of activity, spanning a large chunk of openSenseMap’s -existence.
-This is less useful, as older boxes are active for a longer time by -definition. If you have an idea how to compensate for that, please send -a Pull -Request!
-# NOTE: boxes older than 2016 missing due to missing updatedAt in database
-duration = boxes %>%
- mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
- group_by(year) %>%
- filter(!is.na(lastMeasurement)) %>%
- mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
- filter(duration >= 0)
-
-ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
- geom_boxplot() +
- coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
-
-Other visualisations come to mind, and are left as an exercise to the -reader. If you implemented some, feel free to add them to this vignette -via a Pull -Request.
-This package provides data ingestion functions for almost any data stored on the open data platform for environemental sensordata https://opensensemap.org. Its main goals are to provide means for:
-Before we look at actual observations, lets get a grasp of the openSenseMap datasets’ structure.
-library(magrittr)
-library(opensensmapr)
-
-all_sensors = osem_boxes()
summary(all_sensors)
## boxes total: 1781
-##
-## boxes by exposure:
-## indoor mobile outdoor unknown
-## 290 55 1416 20
-##
-## boxes by model:
-## custom homeEthernet homeEthernetFeinstaub
-## 336 92 49
-## homeWifi homeWifiFeinstaub luftdaten_pms1003
-## 193 144 1
-## luftdaten_pms1003_bme280 luftdaten_pms5003_bme280 luftdaten_pms7003_bme280
-## 1 5 2
-## luftdaten_sds011 luftdaten_sds011_bme280 luftdaten_sds011_bmp180
-## 57 197 19
-## luftdaten_sds011_dht11 luftdaten_sds011_dht22
-## 46 639
-##
-## $last_measurement_within
-## 1h 1d 30d 365d never
-## 929 954 1091 1428 235
-##
-## oldest box: 2014-05-28 15:36:14 (CALIMERO)
-## newest box: 2018-05-26 10:29:27 (UOS_DDI)
-##
-## sensors per box:
-## Min. 1st Qu. Median Mean 3rd Qu. Max.
-## 1.0 4.0 4.0 4.6 5.0 33.0
-This gives a good overview already: As of writing this, there are more than 700 sensor stations, of which ~50% are currently running. Most of them are placed outdoors and have around 5 sensors each. The oldest station is from May 2014, while the latest station was registered a couple of minutes ago.
-Another feature of interest is the spatial distribution of the boxes: plot()
can help us out here. This function requires a bunch of optional dependencies though.
if (!require('maps')) install.packages('maps')
-if (!require('maptools')) install.packages('maptools')
-if (!require('rgeos')) install.packages('rgeos')
-
-plot(all_sensors)
It seems we have to reduce our area of interest to Germany.
-But what do these sensor stations actually measure? Lets find out. osem_phenomena()
gives us a named list of of the counts of each observed phenomenon for the given set of sensor stations:
phenoms = osem_phenomena(all_sensors)
-str(phenoms)
## List of 433
-## $ Temperatur : int 1608
-## $ rel. Luftfeuchte : int 1422
-## $ PM10 : int 1200
-## $ PM2.5 : int 1198
-## $ Luftdruck : int 825
-## $ Beleuchtungsstärke : int 481
-## $ UV-Intensität : int 472
-## $ Luftfeuchtigkeit : int 84
-## $ Temperature : int 49
-## $ Humidity : int 42
-## $ Helligkeit : int 25
-## $ Lautstärke : int 21
-## $ Schall : int 20
-## $ UV : int 20
-## $ Pressure : int 19
-## $ Licht : int 18
-## $ Luftfeuchte : int 14
-## $ Umgebungslautstärke : int 14
-## $ Lämpötila : int 13
-## $ Ilmanpaine : int 12
-## $ Signal : int 12
-## $ Feinstaub PM10 : int 10
-## $ Feinstaub PM2.5 : int 9
-## $ Kosteus : int 8
-## $ Temperatur DHT22 : int 8
-## $ Valonmäärä : int 8
-## $ temperature : int 8
-## $ PM01 : int 7
-## $ UV-säteily : int 7
-## $ Niederschlag : int 6
-## $ UV-Strahlung : int 6
-## $ Wind speed : int 6
-## $ Windgeschwindigkeit : int 6
-## $ humidity : int 6
-## $ Ilmankosteus : int 5
-## $ Wassertemperatur : int 5
-## $ Windrichtung : int 5
-## $ rel. Luftfeuchtigkeit : int 5
-## $ Druck : int 4
-## $ Light : int 4
-## $ Temperature 1 : int 4
-## $ UV Index : int 4
-## $ UV-Säteily : int 4
-## $ lautstärke : int 4
-## $ rel. Luftfeuchte 1 : int 4
-## $ rel. Luftfeuchte DHT22 : int 4
-## $ relative Luftfeuchtigkeit : int 4
-## $ Air pressure : int 3
-## $ Batterie : int 3
-## $ Battery : int 3
-## $ DS18B20_Probe01 : int 3
-## $ DS18B20_Probe02 : int 3
-## $ DS18B20_Probe03 : int 3
-## $ DS18B20_Probe04 : int 3
-## $ DS18B20_Probe05 : int 3
-## $ Licht (digital) : int 3
-## $ Luftdruck (BME280) : int 3
-## $ PM 10 : int 3
-## $ PM 2.5 : int 3
-## $ Temp : int 3
-## $ Temperatur (BME280) : int 3
-## $ Temperatur HDC1008 : int 3
-## $ Temperatura : int 3
-## $ Temperature 2 : int 3
-## $ UV-Index : int 3
-## $ Valoisuus : int 3
-## $ Wind Gust : int 3
-## $ pressure : int 3
-## $ 1 : int 2
-## $ 10 : int 2
-## $ 2 : int 2
-## $ 3 : int 2
-## $ 4 : int 2
-## $ 5 : int 2
-## $ 6 : int 2
-## $ 7 : int 2
-## $ 8 : int 2
-## $ 9 : int 2
-## $ Air Pressure : int 2
-## $ Anderer : int 2
-## $ Battery voltage : int 2
-## $ CO2 : int 2
-## $ Feuchte : int 2
-## $ Illuminance : int 2
-## $ Intensity : int 2
-## $ Leitfähigkeit : int 2
-## $ Lichtintensität : int 2
-## $ Luftdruck BMP180 : int 2
-## $ Luftfeuchte (BME280) : int 2
-## $ Luftqualität : int 2
-## $ Lufttemperatur : int 2
-## $ PM25 : int 2
-## $ Radioactivity : int 2
-## $ Radioaktivität : int 2
-## $ Regen : int 2
-## $ Relative Humidity : int 2
-## $ Sound : int 2
-## $ Temperatur (DHT22) : int 2
-## $ Temperatur BMP180 : int 2
-## [list output truncated]
-Thats quite some noise there, with many phenomena being measured by a single sensor only, or many duplicated phenomena due to slightly different spellings. We should clean that up, but for now let’s just filter out the noise and find those phenomena with high sensor numbers:
-phenoms[phenoms > 20]
## $Temperatur
-## [1] 1608
-##
-## $`rel. Luftfeuchte`
-## [1] 1422
-##
-## $PM10
-## [1] 1200
-##
-## $PM2.5
-## [1] 1198
-##
-## $Luftdruck
-## [1] 825
-##
-## $Beleuchtungsstärke
-## [1] 481
-##
-## $`UV-Intensität`
-## [1] 472
-##
-## $Luftfeuchtigkeit
-## [1] 84
-##
-## $Temperature
-## [1] 49
-##
-## $Humidity
-## [1] 42
-##
-## $Helligkeit
-## [1] 25
-##
-## $Lautstärke
-## [1] 21
-Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more interesting to analyze though. We should check how many sensor stations provide useful data: We want only those boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting measurements:
-pm25_sensors = osem_boxes(
- exposure = 'outdoor',
- date = Sys.time(), # ±4 hours
- phenomenon = 'PM2.5'
-)
summary(pm25_sensors)
## boxes total: 791
-##
-## boxes by exposure:
-## outdoor
-## 791
-##
-## boxes by model:
-## custom homeEthernetFeinstaub homeWifi
-## 29 37 6
-## homeWifiFeinstaub luftdaten_pms1003_bme280 luftdaten_pms5003_bme280
-## 57 1 1
-## luftdaten_pms7003_bme280 luftdaten_sds011 luftdaten_sds011_bme280
-## 2 32 137
-## luftdaten_sds011_bmp180 luftdaten_sds011_dht11 luftdaten_sds011_dht22
-## 14 32 443
-##
-## $last_measurement_within
-## 1h 1d 30d 365d never
-## 771 780 784 789 2
-##
-## oldest box: 2016-06-02 12:09:47 (BalkonBox Mindener Str.)
-## newest box: 2018-05-24 20:29:50 (Stadthalle)
-##
-## sensors per box:
-## Min. 1st Qu. Median Mean 3rd Qu. Max.
-## 2.000 4.000 4.000 4.617 5.000 12.000
-plot(pm25_sensors)
Thats still more than 200 measuring stations, we can work with that.
-Having analyzed the available data sources, let’s finally get some measurements. We could call osem_measurements(pm25_sensors)
now, however we are focussing on a restricted area of interest, the city of Berlin. Luckily we can get the measurements filtered by a bounding box:
library(sf)
## Linking to GEOS 3.5.1, GDAL 2.2.2, proj.4 4.9.2
-library(units)
##
-## Attaching package: 'units'
-## The following object is masked from 'package:base':
-##
-## %*%
-library(lubridate)
-library(dplyr)
-
-# construct a bounding box: 12 kilometers around Berlin
-berlin = st_point(c(13.4034, 52.5120)) %>%
- st_sfc(crs = 4326) %>%
- st_transform(3857) %>% # allow setting a buffer in meters
- st_buffer(set_units(12, km)) %>%
- st_transform(4326) %>% # the opensensemap expects WGS 84
- st_bbox()
pm25 = osem_measurements(
- berlin,
- phenomenon = 'PM2.5',
- from = now() - days(20), # defaults to 2 days
- to = now()
-)
-
-plot(pm25)
Now we can get started with actual spatiotemporal data analysis. First, lets mask the seemingly uncalibrated sensors:
-outliers = filter(pm25, value > 100)$sensorId
-bad_sensors = outliers[, drop = T] %>% levels()
-
-pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
Then plot the measuring locations, flagging the outliers:
-st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
Removing these sensors yields a nicer time series plot:
-pm25 %>% filter(invalid == FALSE) %>% plot()
Further analysis: comparison with LANUV data TODO
It may be useful to download data from openSenseMap only once. For reproducible results, the data should be saved to disk, and reloaded at a later point.
-This avoids..
-This vignette shows how to use this built in opensensmapr
feature, and how to do it yourself in case you want to save to other data formats.
# this vignette requires:
-library(opensensmapr)
-library(jsonlite)
-library(readr)
All data retrieval functions of opensensmapr
have a built in caching feature, which serializes an API response to disk. Subsequent identical requests will then return the serialized data instead of making another request.
To use this feature, just add a path to a directory to the cache
parameter:
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
-
-# the next identical request will hit the cache only!
-b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
-
-# requests without the cache parameter will still be performed normally
-b = osem_boxes(grouptag = 'ifgi')
Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL:
-list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
## [1] "osemcache.17db5c57fc6fca4d836fa2cf30345ce8767cd61a.rds"
-You can maintain multiple caches simultaneously which allows to only store data related to a script in the same directory:
-cacheDir = getwd() # current working directory
-b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
-
-# the next identical request will hit the cache only!
-b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
To get fresh results again, just call osem_clear_cache()
for the respective cache:
osem_clear_cache() # clears default cache
-osem_clear_cache(getwd()) # clears a custom cache
If you want to roll your own serialization method to support custom data formats, here’s how:
-# first get our example data:
-measurements = osem_measurements('Windrichtung')
If you are paranoid and worry about .rds
files not being decodable anymore in the (distant) future, you could serialize to a plain text format such as JSON. This of course comes at the cost of storage space and performance.
# serializing senseBoxes to JSON, and loading from file again:
-write(jsonlite::serializeJSON(measurements), 'measurements.json')
-measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
-class(measurements_from_file)
## [1] "osem_measurements" "tbl_df" "tbl"
-## [4] "data.frame"
-This method also persists the R object metadata (classes, attributes). If you were to use a serialization method that can’t persist object metadata, you could re-apply it with the following functions:
-# note the toJSON call instead of serializeJSON
-write(jsonlite::toJSON(measurements), 'measurements_bad.json')
-measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
-class(measurements_without_attrs)
## [1] "data.frame"
-measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
-class(measurements_with_attrs)
## [1] "osem_measurements" "tbl_df" "tbl"
-## [4] "data.frame"
-The same goes for boxes via osem_as_sensebox()
.