diff --git a/inst/doc/osem-history.R b/inst/doc/osem-history.R new file mode 100644 index 0000000..cda0dcb --- /dev/null +++ b/inst/doc/osem-history.R @@ -0,0 +1,133 @@ +## ----setup, results='hide', message=FALSE, warning=FALSE---------------------- +# required packages: +library(opensensmapr) # data download +library(dplyr) # data wrangling +library(ggplot2) # plotting +library(lubridate) # date arithmetic +library(zoo) # rollmean() + +## ----download----------------------------------------------------------------- +# if you want to see results for a specific subset of boxes, +# just specify a filter such as grouptag='ifgi' here +boxes = osem_boxes() + +## ----exposure_counts, message=FALSE------------------------------------------- +exposure_counts = boxes %>% + group_by(exposure) %>% + mutate(count = row_number(createdAt)) + +exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') +ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) + + geom_line() + + scale_colour_manual(values = exposure_colors) + + xlab('Registration Date') + ylab('senseBox count') + +## ----exposure_summary--------------------------------------------------------- +exposure_counts %>% + summarise( + oldest = min(createdAt), + newest = max(createdAt), + count = max(count) + ) %>% + arrange(desc(count)) + +## ----grouptag_counts, message=FALSE------------------------------------------- +grouptag_counts = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 8 or more members + filter(length(grouptag) >= 8 & !is.na(grouptag)) %>% + mutate(count = row_number(createdAt)) + +# helper for sorting the grouptags by boxcount +sortLvls = function(oldFactor, ascending = TRUE) { + lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() + factor(oldFactor, levels = lvls) +} +grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) + +ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) + + geom_line(aes(group = grouptag)) + + xlab('Registration Date') + ylab('senseBox count') + +## ----grouptag_summary--------------------------------------------------------- +grouptag_counts %>% + summarise( + oldest = min(createdAt), + newest = max(createdAt), + count = max(count) + ) %>% + arrange(desc(count)) + +## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'------ +bins = 'week' +mvavg_bins = 6 + +growth = boxes %>% + mutate(week = cut(as.Date(createdAt), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'registered') + +## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'-------- +inactive = boxes %>% + # remove boxes that were updated in the last two days, + # b/c any box becomes inactive at some point by definition of updatedAt + filter(updatedAt < now() - days(2)) %>% + mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'inactive') + +## ----growthrate, warning=FALSE, message=FALSE, results='hide'----------------- +boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) + +ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + + xlab('Time') + ylab(paste('rate per ', bins)) + + scale_x_date(date_breaks="years", date_labels="%Y") + + scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + + geom_point(aes(y = count), size = 0.5) + + # moving average, make first and last value NA (to ensure identical length of vectors) + geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) + +## ----exposure_duration, message=FALSE----------------------------------------- +duration = boxes %>% + group_by(exposure) %>% + filter(!is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = exposure, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +## ----grouptag_duration, message=FALSE----------------------------------------- +duration = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 8 or more members + filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = grouptag, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +duration %>% + summarize( + duration_avg = round(mean(duration)), + duration_min = round(min(duration)), + duration_max = round(max(duration)), + oldest_box = round(max(difftime(now(), createdAt, units='days'))) + ) %>% + arrange(desc(duration_avg)) + +## ----year_duration, message=FALSE--------------------------------------------- +# NOTE: boxes older than 2016 missing due to missing updatedAt in database +duration = boxes %>% + mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>% + group_by(year) %>% + filter(!is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') + diff --git a/inst/doc/osem-history.Rmd b/inst/doc/osem-history.Rmd new file mode 100644 index 0000000..5a6ab21 --- /dev/null +++ b/inst/doc/osem-history.Rmd @@ -0,0 +1,243 @@ +--- +title: "Visualising the History of openSenseMap.org" +author: "Norwin Roosen" +date: '`r Sys.Date()`' +output: + rmarkdown::html_vignette: + df_print: kable + fig_height: 5 + fig_width: 7 + toc: yes + html_document: + code_folding: hide + df_print: kable + theme: lumen + toc: yes + toc_float: yes +vignette: > + %\VignetteIndexEntry{Visualising the History of openSenseMap.org} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +> This vignette serves as an example on data wrangling & visualization with +`opensensmapr`, `dplyr` and `ggplot2`. + +```{r setup, results='hide', message=FALSE, warning=FALSE} +# required packages: +library(opensensmapr) # data download +library(dplyr) # data wrangling +library(ggplot2) # plotting +library(lubridate) # date arithmetic +library(zoo) # rollmean() +``` + +openSenseMap.org has grown quite a bit in the last years; it would be interesting +to see how we got to the current `r osem_counts()$boxes` sensor stations, +split up by various attributes of the boxes. + +While `opensensmapr` provides extensive methods of filtering boxes by attributes +on the server, we do the filtering within R to save time and gain flexibility. +So the first step is to retrieve *all the boxes*: + +```{r download} +# if you want to see results for a specific subset of boxes, +# just specify a filter such as grouptag='ifgi' here +boxes = osem_boxes() +``` + +# Plot count of boxes by time {.tabset} +By looking at the `createdAt` attribute of each box we know the exact time a box +was registered. +With this approach we have no information about boxes that were deleted in the +meantime, but that's okay for now. + +## ...and exposure +```{r exposure_counts, message=FALSE} +exposure_counts = boxes %>% + group_by(exposure) %>% + mutate(count = row_number(createdAt)) + +exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') +ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) + + geom_line() + + scale_colour_manual(values = exposure_colors) + + xlab('Registration Date') + ylab('senseBox count') +``` + +Outdoor boxes are growing *fast*! +We can also see the introduction of `mobile` sensor "stations" in 2017. While +mobile boxes are still few, we can expect a quick rise in 2018 once the new +senseBox MCU with GPS support is released. + +Let's have a quick summary: +```{r exposure_summary} +exposure_counts %>% + summarise( + oldest = min(createdAt), + newest = max(createdAt), + count = max(count) + ) %>% + arrange(desc(count)) +``` + +## ...and grouptag +We can try to find out where the increases in growth came from, by analysing the +box count by grouptag. + +Caveats: Only a small subset of boxes has a grouptag, and we should assume +that these groups are actually bigger. Also, we can see that grouptag naming is +inconsistent (`Luftdaten`, `luftdaten.info`, ...) + +```{r grouptag_counts, message=FALSE} +grouptag_counts = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 8 or more members + filter(length(grouptag) >= 8 & !is.na(grouptag)) %>% + mutate(count = row_number(createdAt)) + +# helper for sorting the grouptags by boxcount +sortLvls = function(oldFactor, ascending = TRUE) { + lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() + factor(oldFactor, levels = lvls) +} +grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) + +ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) + + geom_line(aes(group = grouptag)) + + xlab('Registration Date') + ylab('senseBox count') +``` + +```{r grouptag_summary} +grouptag_counts %>% + summarise( + oldest = min(createdAt), + newest = max(createdAt), + count = max(count) + ) %>% + arrange(desc(count)) +``` + +# Plot rate of growth and inactivity per week +First we group the boxes by `createdAt` into bins of one week: +```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'} +bins = 'week' +mvavg_bins = 6 + +growth = boxes %>% + mutate(week = cut(as.Date(createdAt), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'registered') +``` + +We can do the same for `updatedAt`, which informs us about the last change to +a box, including uploaded measurements. +This method of determining inactive boxes is fairly inaccurate and should be +considered an approximation, because we have no information about intermediate +inactive phases. +Also deleted boxes would probably have a big impact here. +```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'} +inactive = boxes %>% + # remove boxes that were updated in the last two days, + # b/c any box becomes inactive at some point by definition of updatedAt + filter(updatedAt < now() - days(2)) %>% + mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'inactive') +``` + +Now we can combine both datasets for plotting: +```{r growthrate, warning=FALSE, message=FALSE, results='hide'} +boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) + +ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + + xlab('Time') + ylab(paste('rate per ', bins)) + + scale_x_date(date_breaks="years", date_labels="%Y") + + scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + + geom_point(aes(y = count), size = 0.5) + + # moving average, make first and last value NA (to ensure identical length of vectors) + geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) +``` + +We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`. +This was enabled by an integration of openSenseMap.org into the firmware of the +air quality monitoring project [luftdaten.info](https://sensor.community/de/). +The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues +of the senseBox hardware, but I have no data on the exact time frames to verify. + +# Plot duration of boxes being active {.tabset} +While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity +of each box, and look at metrics by exposure and grouptag once more: + +## ...by exposure +```{r exposure_duration, message=FALSE} +duration = boxes %>% + group_by(exposure) %>% + filter(!is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = exposure, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') +``` + +The time of activity averages at only `r round(mean(duration$duration))` days, +though there are boxes with `r round(max(duration$duration))` days of activity, +spanning a large chunk of openSenseMap's existence. + +## ...by grouptag +```{r grouptag_duration, message=FALSE} +duration = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 8 or more members + filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = grouptag, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +duration %>% + summarize( + duration_avg = round(mean(duration)), + duration_min = round(min(duration)), + duration_max = round(max(duration)), + oldest_box = round(max(difftime(now(), createdAt, units='days'))) + ) %>% + arrange(desc(duration_avg)) +``` + +The time of activity averages at only `r round(mean(duration$duration))` days, +though there are boxes with `r round(max(duration$duration))` days of activity, +spanning a large chunk of openSenseMap's existence. + +## ...by year of registration +This is less useful, as older boxes are active for a longer time by definition. +If you have an idea how to compensate for that, please send a [Pull Request][PR]! + +```{r year_duration, message=FALSE} +# NOTE: boxes older than 2016 missing due to missing updatedAt in database +duration = boxes %>% + mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>% + group_by(year) %>% + filter(!is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') +``` + +# More Visualisations +Other visualisations come to mind, and are left as an exercise to the reader. +If you implemented some, feel free to add them to this vignette via a [Pull Request][PR]. + +* growth by phenomenon +* growth by location -> (interactive) map +* set inactive rate in relation to total box count +* filter timespans with big dips in growth rate, and extrapolate the amount of + senseBoxes that could be on the platform today, assuming there were no production issues ;) + +[PR]: https://github.com/sensebox/opensensmapr/pulls diff --git a/inst/doc/osem-history.html b/inst/doc/osem-history.html new file mode 100644 index 0000000..4f7b3d5 --- /dev/null +++ b/inst/doc/osem-history.html @@ -0,0 +1,1818 @@ + + + + + + + + + + + + + + + + +Visualising the History of openSenseMap.org + + + + + + + + + + + + + + + + + + + + + + + + + + +

Visualising the History of +openSenseMap.org

+

Norwin Roosen

+

2023-02-23

+ + +
+ +
+ +
+

This vignette serves as an example on data wrangling & +visualization with opensensmapr, dplyr and +ggplot2.

+
+
# required packages:
+library(opensensmapr) # data download
+library(dplyr)        # data wrangling
+library(ggplot2)      # plotting
+library(lubridate)    # date arithmetic
+library(zoo)          # rollmean()
+

openSenseMap.org has grown quite a bit in the last years; it would be +interesting to see how we got to the current 11367 sensor stations, +split up by various attributes of the boxes.

+

While opensensmapr provides extensive methods of +filtering boxes by attributes on the server, we do the filtering within +R to save time and gain flexibility. So the first step is to retrieve +all the boxes:

+
# if you want to see results for a specific subset of boxes,
+# just specify a filter such as grouptag='ifgi' here
+boxes = osem_boxes()
+
+

Plot count of boxes by time

+

By looking at the createdAt attribute of each box we +know the exact time a box was registered. With this approach we have no +information about boxes that were deleted in the meantime, but that’s +okay for now.

+
+

…and exposure

+
exposure_counts = boxes %>%
+  group_by(exposure) %>%
+  mutate(count = row_number(createdAt))
+
+exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
+ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
+  geom_line() +
+  scale_colour_manual(values = exposure_colors) +
+  xlab('Registration Date') + ylab('senseBox count')
+

+

Outdoor boxes are growing fast! We can also see the +introduction of mobile sensor “stations” in 2017. While +mobile boxes are still few, we can expect a quick rise in 2018 once the +new senseBox MCU with GPS support is released.

+

Let’s have a quick summary:

+
exposure_counts %>%
+  summarise(
+    oldest = min(createdAt),
+    newest = max(createdAt),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
exposureoldestnewestcount
outdoor2016-08-09 19:34:422023-02-23 07:56:598413
indoor2018-05-10 20:14:442023-02-21 14:23:522344
mobile2020-10-24 14:39:302023-02-20 16:32:48591
unknown2022-03-01 07:04:312022-03-30 11:25:4319
+
+
+
+

…and grouptag

+

We can try to find out where the increases in growth came from, by +analysing the box count by grouptag.

+

Caveats: Only a small subset of boxes has a grouptag, and we should +assume that these groups are actually bigger. Also, we can see that +grouptag naming is inconsistent (Luftdaten, +luftdaten.info, …)

+
grouptag_counts = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 8 or more members
+  filter(length(grouptag) >= 8 & !is.na(grouptag)) %>%
+  mutate(count = row_number(createdAt))
+
+# helper for sorting the grouptags by boxcount
+sortLvls = function(oldFactor, ascending = TRUE) {
+  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
+  factor(oldFactor, levels = lvls)
+}
+grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
+
+ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
+  geom_line(aes(group = grouptag)) +
+  xlab('Registration Date') + ylab('senseBox count')
+

+
grouptag_counts %>%
+  summarise(
+    oldest = min(createdAt),
+    newest = max(createdAt),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
grouptagoldestnewestcount
edu2022-03-30 11:25:432023-02-20 11:06:45430
Save Dnipro2022-03-30 11:25:432022-03-30 11:25:43354
Luftdaten2022-03-30 11:25:432023-01-27 15:22:54244
HU Explorers2022-03-30 11:25:432022-12-14 10:11:34124
CS:iDrop2023-01-10 10:22:332023-01-31 15:13:46120
#stropdeaer2022-03-30 11:25:432022-03-30 11:25:43101
321heiss2022-06-27 14:12:252022-08-08 10:22:2191
GIZ Clean Air Day Project2022-03-30 11:25:432022-03-30 11:25:4376
Captographies2021-05-21 15:24:452023-01-31 12:11:4962
Futurium2022-03-30 11:25:432022-03-30 11:25:4340
Bad_Hersfeld2022-03-30 11:25:432022-03-30 11:25:4337
TKS Bonn2022-03-30 11:25:432022-03-30 11:25:4336
Mikroprojekt Mitmachklima2022-03-30 11:25:432022-08-23 13:14:1134
kerekdomb_2022-03-30 11:25:432022-03-30 11:25:4334
Bottrop-Feinstaub2022-03-30 11:25:432022-03-30 11:25:4333
Luchtwachters Delft2022-03-30 11:25:432022-03-30 11:25:4333
Futurium 20212022-03-30 11:25:432022-03-30 11:25:4332
Feinstaub2022-03-30 11:25:432022-08-01 16:27:1029
luftdaten.info2022-03-30 11:25:432022-03-30 11:25:4328
ifgi2022-03-30 11:25:432022-03-30 11:25:4326
SUGUCS2022-11-30 15:25:322023-01-23 13:17:5425
cleanairfrome2022-03-30 11:25:432022-05-15 21:13:3024
WAUW!denberg2022-03-30 11:25:432022-03-30 11:25:4323
freshairbromley2022-03-30 11:25:432023-01-31 10:18:5723
Riga2022-03-30 11:25:432022-03-30 11:25:4322
KJR-M2022-03-30 11:25:432022-03-30 11:25:4321
Mikroklima2022-03-30 11:25:432022-09-05 08:38:5721
bad_hersfeld2022-03-30 11:25:432022-06-14 09:34:0221
Smart City MS2022-03-30 11:25:432022-03-30 11:25:4320
SekSeeland2022-03-30 11:25:432022-03-30 11:25:4319
Luftdaten.info2022-03-30 11:25:432022-03-30 11:25:4318
12022-03-30 11:25:432022-04-25 15:07:3917
Apeldoorn2022-03-30 11:25:432022-03-30 11:25:4317
luftdaten2022-03-30 11:25:432022-03-30 11:25:4317
BurgerMeetnet2022-03-30 11:25:432022-05-10 21:22:3516
Haus C2022-03-30 11:25:432022-03-30 11:25:4316
AGIN2022-11-28 17:33:122022-11-28 17:42:1815
APPI2023-01-26 13:38:222023-01-26 13:40:5915
BRGL2022-11-06 19:23:432022-11-06 22:08:3615
BRGW2022-11-02 10:28:522022-11-02 13:32:1215
Burgermeetnet2022-03-30 11:25:432022-03-30 11:25:4315
HTLJ2022-11-21 22:04:172022-11-21 22:05:4715
MSGB2022-11-14 09:08:572022-11-14 10:19:2415
MSHO2022-12-20 09:28:402022-12-20 10:01:3815
MSIN2022-11-21 17:02:392022-11-21 23:06:2215
MSKE2023-01-05 15:40:582023-01-05 15:52:0215
MakeLight2022-03-30 11:25:432022-03-30 11:25:4315
PMSI2023-01-20 14:22:032023-01-20 14:31:5215
Haus B2022-03-30 11:25:432022-03-30 11:25:4314
UrbanGarden2023-02-02 19:27:402023-02-18 14:50:1914
Соседи по воздуху2022-03-30 11:25:432023-01-27 09:50:4314
PIE2022-03-30 11:25:432022-03-30 11:25:4313
RB-DSJ2022-03-30 11:25:432022-03-30 11:25:4313
Sofia2022-03-30 11:25:432022-03-30 11:25:4312
co2mofetten2022-03-30 11:25:432023-01-17 07:38:2112
Haus D2022-03-30 11:25:432022-03-30 11:25:4311
Netlight2022-03-30 11:25:432022-03-30 11:25:4311
home2022-03-30 11:25:432022-03-30 11:25:4311
#STROPDEAER2022-03-30 11:25:432023-02-16 15:12:5010
AirAberdeen2022-03-30 11:25:432022-03-30 11:25:4310
Balthasar-Neumann-Schule 12022-03-30 11:25:432022-03-30 11:25:4310
Bestäuberprojekt2022-03-30 11:25:432022-03-30 11:25:4310
Che Aria Tira?2022-03-30 11:25:432022-03-30 11:25:4310
HBG Bonn2022-03-30 11:25:432022-03-30 11:25:4310
IntegrA2022-03-30 11:25:432022-03-30 11:25:4310
Mikroklima H2022-05-07 17:29:002022-05-07 17:47:4210
dwih-sp2022-03-30 11:25:432022-03-30 11:25:4310
esri-de2022-03-30 11:25:432022-03-30 11:25:4310
makerspace-partheland2022-03-30 11:25:432023-02-20 18:34:5010
montorioveronese.it2022-03-30 11:25:432022-12-29 07:45:5710
ATSO2022-03-30 11:25:432022-03-30 11:25:439
Fläming2022-08-15 19:16:482022-12-13 06:29:229
Mikroklima C-R2022-03-30 11:25:432022-03-30 11:25:439
Ostroda2022-03-30 11:25:432022-03-30 11:25:439
RSS2022-03-30 11:25:432022-03-30 11:25:439
clevermint2022-03-30 11:25:432022-03-30 11:25:439
test2022-03-30 11:25:432022-12-18 22:20:349
22022-03-30 11:25:432023-01-07 15:44:298
DBDS2022-03-30 11:25:432022-03-30 11:25:438
Data4City2022-03-30 11:25:432022-03-30 11:25:438
IKG2022-03-30 11:25:432022-03-30 11:25:438
IVKOWeek2022-03-30 11:25:432022-07-05 09:42:318
Koerber-Stiftung2022-03-30 11:25:432022-03-30 11:25:438
M72022-03-30 11:25:432022-11-28 13:00:448
Natlab Ökologie2022-03-30 11:25:432022-03-30 11:25:438
PGKN2022-03-30 11:25:432022-03-30 11:25:438
Raumanmeri2022-03-30 11:25:432022-03-30 11:25:438
stw2022-03-30 11:25:432022-03-30 11:25:438
+
+
+
+
+

Plot rate of growth and inactivity per week

+

First we group the boxes by createdAt into bins of one +week:

+
bins = 'week'
+mvavg_bins = 6
+
+growth = boxes %>%
+  mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'registered')
+

We can do the same for updatedAt, which informs us about +the last change to a box, including uploaded measurements. This method +of determining inactive boxes is fairly inaccurate and should be +considered an approximation, because we have no information about +intermediate inactive phases. Also deleted boxes would probably have a +big impact here.

+
inactive = boxes %>%
+  # remove boxes that were updated in the last two days,
+  # b/c any box becomes inactive at some point by definition of updatedAt
+  filter(updatedAt < now() - days(2)) %>%
+  mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'inactive')
+

Now we can combine both datasets for plotting:

+
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
+
+ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
+  xlab('Time') + ylab(paste('rate per ', bins)) +
+  scale_x_date(date_breaks="years", date_labels="%Y") +
+  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
+  geom_point(aes(y = count), size = 0.5) +
+  # moving average, make first and last value NA (to ensure identical length of vectors)
+  geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
+

+

We see a sudden rise in early 2017, which lines up with the fast +growing grouptag Luftdaten. This was enabled by an +integration of openSenseMap.org into the firmware of the air quality +monitoring project luftdaten.info. The dips in mid +2017 and early 2018 could possibly be explained by production/delivery +issues of the senseBox hardware, but I have no data on the exact time +frames to verify.

+
+
+

Plot duration of boxes being active

+

While we are looking at createdAt and +updatedAt, we can also extract the duration of activity of +each box, and look at metrics by exposure and grouptag once more:

+
+

…by exposure

+
duration = boxes %>%
+  group_by(exposure) %>%
+  filter(!is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+
+ggplot(duration, aes(x = exposure, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+

+

The time of activity averages at only 157 days, though there are +boxes with 2389 days of activity, spanning a large chunk of +openSenseMap’s existence.

+
+
+

…by grouptag

+
duration = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 8 or more members
+  filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+  
+ggplot(duration, aes(x = grouptag, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+

+
duration %>%
+  summarize(
+    duration_avg = round(mean(duration)),
+    duration_min = round(min(duration)),
+    duration_max = round(max(duration)),
+    oldest_box = round(max(difftime(now(), createdAt, units='days')))
+  ) %>%
+  arrange(desc(duration_avg))
+
+ +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
grouptagduration_avgduration_minduration_maxoldest_box
Ostroda330 days330 days330 days330 days
Mikroklima C-R328 days321 days330 days330 days
Apeldoorn326 days263 days330 days330 days
freshairbromley298 days23 days330 days330 days
Mikroklima280 days42 days330 days330 days
Mikroklima H279 days229 days292 days292 days
Smart City MS272 days0 days330 days330 days
Feinstaub220 days0 days330 days330 days
co2mofetten212 days0 days330 days330 days
makerspace-partheland210 days0 days330 days330 days
Luftdaten208 days0 days330 days330 days
luftdaten.info197 days0 days330 days330 days
Burgermeetnet188 days0 days330 days330 days
esri-de188 days0 days330 days330 days
#stropdeaer185 days0 days330 days330 days
Sofia170 days0 days330 days330 days
WAUW!denberg166 days0 days330 days330 days
KJR-M165 days0 days330 days330 days
IKG162 days0 days330 days330 days
AirAberdeen153 days0 days330 days330 days
M7152 days87 days243 days330 days
1145 days0 days330 days330 days
BurgerMeetnet139 days0 days330 days330 days
Luftdaten.info138 days0 days330 days330 days
Bottrop-Feinstaub132 days0 days330 days330 days
stw129 days0 days330 days330 days
cleanairfrome128 days0 days330 days330 days
montorioveronese.it128 days0 days330 days330 days
RB-DSJ122 days0 days330 days330 days
Mikroprojekt Mitmachklima117 days0 days330 days330 days
Luchtwachters Delft109 days0 days330 days330 days
BRGL107 days85 days109 days109 days
Fläming107 days23 days175 days192 days
BRGW106 days98 days113 days113 days
PIE101 days0 days330 days330 days
Riga101 days0 days330 days330 days
kerekdomb_100 days0 days330 days330 days
luftdaten99 days0 days330 days330 days
home94 days0 days330 days330 days
Bad_Hersfeld93 days0 days330 days330 days
dwih-sp91 days0 days330 days330 days
MSGB89 days50 days101 days101 days
AGIN86 days86 days86 days87 days
HTLJ84 days58 days94 days94 days
bad_hersfeld84 days0 days330 days330 days
Соседи по воздуху84 days0 days330 days330 days
Captographies78 days0 days643 days643 days
Save Dnipro74 days0 days330 days330 days
PGKN67 days0 days330 days330 days
Netlight60 days0 days330 days330 days
MSHO57 days36 days65 days65 days
Futurium52 days0 days330 days330 days
MSIN52 days0 days79 days94 days
test52 days0 days329 days330 days
ifgi51 days0 days330 days330 days
#STROPDEAER50 days0 days330 days330 days
ATSO48 days0 days279 days330 days
246 days0 days310 days330 days
MakeLight46 days0 days330 days330 days
Haus B44 days0 days239 days330 days
Futurium 202143 days0 days329 days330 days
IVKOWeek42 days0 days330 days330 days
DBDS41 days0 days330 days330 days
GIZ Clean Air Day Project36 days0 days330 days330 days
edu36 days0 days330 days330 days
TKS Bonn32 days0 days330 days330 days
HU Explorers28 days0 days319 days330 days
321heiss24 days0 days43 days241 days
SUGUCS9 days0 days53 days85 days
APPI3 days0 days7 days28 days
MSKE3 days0 days7 days49 days
PMSI3 days0 days4 days34 days
RSS3 days0 days28 days330 days
CS:iDrop2 days0 days36 days44 days
UrbanGarden2 days0 days12 days21 days
Balthasar-Neumann-Schule 10 days0 days0 days330 days
Bestäuberprojekt0 days0 days0 days330 days
Che Aria Tira?0 days0 days0 days330 days
Data4City0 days0 days0 days330 days
HBG Bonn0 days0 days0 days330 days
Haus C0 days0 days0 days330 days
Haus D0 days0 days0 days330 days
IntegrA0 days0 days0 days330 days
Koerber-Stiftung0 days0 days0 days330 days
Natlab Ökologie0 days0 days0 days330 days
Raumanmeri0 days0 days0 days330 days
SekSeeland0 days0 days0 days330 days
clevermint0 days0 days0 days330 days
+
+

The time of activity averages at only 89 days, though there are boxes +with 643 days of activity, spanning a large chunk of openSenseMap’s +existence.

+
+
+

…by year of registration

+

This is less useful, as older boxes are active for a longer time by +definition. If you have an idea how to compensate for that, please send +a Pull +Request!

+
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
+duration = boxes %>%
+  mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
+  group_by(year) %>%
+  filter(!is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+
+ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
+

+
+
+
+

More Visualisations

+

Other visualisations come to mind, and are left as an exercise to the +reader. If you implemented some, feel free to add them to this vignette +via a Pull +Request.

+ +
+ + + + + + + + + + + diff --git a/inst/doc/osem-history_revised.R b/inst/doc/osem-history_revised.R new file mode 100644 index 0000000..84ea635 --- /dev/null +++ b/inst/doc/osem-history_revised.R @@ -0,0 +1,162 @@ +## ----setup, results='hide', message=FALSE, warning=FALSE---------------------- +# required packages: +library(opensensmapr) # data download +library(dplyr) # data wrangling +library(ggplot2) # plotting +library(lubridate) # date arithmetic +library(zoo) # rollmean() + +## ----download, results='hide', message=FALSE, warning=FALSE------------------- +# if you want to see results for a specific subset of boxes, +# just specify a filter such as grouptag='ifgi' here +boxes_all = osem_boxes() +boxes = boxes_all + +## ----------------------------------------------------------------------------- +boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp <="2022-12-31") +summary(boxes) -> summary.data.frame + +## ----message=F, warning=F----------------------------------------------------- +if (!require('maps')) install.packages('maps') +if (!require('maptools')) install.packages('maptools') +if (!require('rgeos')) install.packages('rgeos') + +plot(boxes) + +## ----------------------------------------------------------------------------- +phenoms = osem_phenomena(boxes) +str(phenoms) + +## ----------------------------------------------------------------------------- +phenoms[phenoms > 50] + +## ----exposure_counts, message=FALSE------------------------------------------- +exposure_counts = boxes %>% + group_by(exposure) %>% + mutate(count = row_number(locationtimestamp)) + +exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') +ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) + + geom_line() + + scale_colour_manual(values = exposure_colors) + + xlab('Registration Date') + ylab('senseBox count') + +## ----exposure_summary--------------------------------------------------------- +exposure_counts %>% + summarise( + oldest = min(locationtimestamp), + newest = max(locationtimestamp), + count = max(count) + ) %>% + arrange(desc(count)) + +## ----grouptag_counts, message=FALSE------------------------------------------- +grouptag_counts = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 15 or more members + filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>% + mutate(count = row_number(locationtimestamp)) + +# helper for sorting the grouptags by boxcount +sortLvls = function(oldFactor, ascending = TRUE) { + lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() + factor(oldFactor, levels = lvls) +} +grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) + +ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) + + geom_line(aes(group = grouptag)) + + xlab('Registration Date') + ylab('senseBox count') + +## ----grouptag_summary--------------------------------------------------------- +grouptag_counts %>% + summarise( + oldest = min(locationtimestamp), + newest = max(locationtimestamp), + count = max(count) + ) %>% + arrange(desc(count)) + +## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'------ +bins = 'week' +mvavg_bins = 6 + +growth = boxes %>% + mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'registered') + +## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'-------- +inactive = boxes %>% + # remove boxes that were updated in the last two days, + # b/c any box becomes inactive at some point by definition of updatedAt + filter(lastMeasurement < now() - days(2)) %>% + mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>% + filter(as.Date(week) > as.Date("2021-12-31")) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'inactive') + +## ----growthrate, warning=FALSE, message=FALSE, results='hide'----------------- +boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) + +ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + + xlab('Time') + ylab(paste('rate per ', bins)) + + scale_x_date(date_breaks="years", date_labels="%Y") + + scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + + geom_point(aes(y = count), size = 0.5) + + # moving average, make first and last value NA (to ensure identical length of vectors) + geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) + +## ----table_mostregistrations-------------------------------------------------- +boxes_by_date %>% + filter(count > 50) %>% + arrange(desc(count)) + +## ----exposure_duration, message=FALSE----------------------------------------- +durations = boxes %>% + group_by(exposure) %>% + filter(!is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(durations, aes(x = exposure, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +## ----grouptag_duration, message=FALSE----------------------------------------- +durations = boxes %>% + filter(!is.na(lastMeasurement)) %>% + group_by(grouptag) %>% + # only include grouptags with 20 or more members + filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(durations, aes(x = grouptag, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +durations %>% + summarize( + duration_avg = round(mean(duration)), + duration_min = round(min(duration)), + duration_max = round(max(duration)), + oldest_box = round(max(difftime(now(), locationtimestamp, units='days'))) + ) %>% + arrange(desc(duration_avg)) + +## ----year_duration, message=FALSE--------------------------------------------- +# NOTE: boxes older than 2016 missing due to missing updatedAt in database +duration = boxes %>% + mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>% + group_by(year) %>% + filter(!is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') + diff --git a/inst/doc/osem-history_revised.Rmd b/inst/doc/osem-history_revised.Rmd new file mode 100644 index 0000000..c4cbaaa --- /dev/null +++ b/inst/doc/osem-history_revised.Rmd @@ -0,0 +1,300 @@ +--- +title: "Visualising the Development of openSenseMap.org in 2022" +author: "Jan Stenkamp" +date: '`r Sys.Date()`' +output: + html_document: + code_folding: hide + df_print: kable + theme: lumen + toc: yes + toc_float: yes + rmarkdown::html_vignette: + df_print: kable + fig_height: 5 + fig_width: 7 + toc: yes +vignette: > + %\VignetteIndexEntry{Visualising the Development of openSenseMap.org in 2022} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +--- + +> This vignette serves as an example on data wrangling & visualization with +`opensensmapr`, `dplyr` and `ggplot2`. + +```{r setup, results='hide', message=FALSE, warning=FALSE} +# required packages: +library(opensensmapr) # data download +library(dplyr) # data wrangling +library(ggplot2) # plotting +library(lubridate) # date arithmetic +library(zoo) # rollmean() +``` + +openSenseMap.org has grown quite a bit in the last years; it would be interesting +to see how we got to the current `r osem_counts()$boxes` sensor stations, +split up by various attributes of the boxes. + +While `opensensmapr` provides extensive methods of filtering boxes by attributes +on the server, we do the filtering within R to save time and gain flexibility. + + +So the first step is to retrieve *all the boxes*. + +```{r download, results='hide', message=FALSE, warning=FALSE} +# if you want to see results for a specific subset of boxes, +# just specify a filter such as grouptag='ifgi' here +boxes_all = osem_boxes() +boxes = boxes_all +``` +# Introduction +In the following we just want to have a look at the boxes created in 2022, so we filter for them. + +```{r} +boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp <="2022-12-31") +summary(boxes) -> summary.data.frame +``` + + + + + + + +Another feature of interest is the spatial distribution of the boxes: `plot()` +can help us out here. This function requires a bunch of optional dependencies though. + +```{r message=F, warning=F} +if (!require('maps')) install.packages('maps') +if (!require('maptools')) install.packages('maptools') +if (!require('rgeos')) install.packages('rgeos') + +plot(boxes) +``` + +But what do these sensor stations actually measure? Lets find out. +`osem_phenomena()` gives us a named list of of the counts of each observed +phenomenon for the given set of sensor stations: + +```{r} +phenoms = osem_phenomena(boxes) +str(phenoms) +``` + +Thats quite some noise there, with many phenomena being measured by a single +sensor only, or many duplicated phenomena due to slightly different spellings. +We should clean that up, but for now let's just filter out the noise and find +those phenomena with high sensor numbers: + +```{r} +phenoms[phenoms > 50] +``` + + +# Plot count of boxes by time {.tabset} +By looking at the `createdAt` attribute of each box we know the exact time a box +was registered. Because of some database migration issues the `createdAt` values are mostly wrong (~80% of boxes created 2022-03-30), so we are using the `timestamp` attribute of the `currentlocation` which should in most cases correspond to the creation date. + +With this approach we have no information about boxes that were deleted in the +meantime, but that's okay for now. + +## ...and exposure +```{r exposure_counts, message=FALSE} +exposure_counts = boxes %>% + group_by(exposure) %>% + mutate(count = row_number(locationtimestamp)) + +exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') +ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) + + geom_line() + + scale_colour_manual(values = exposure_colors) + + xlab('Registration Date') + ylab('senseBox count') +``` + +Outdoor boxes are growing *fast*! +We can also see the introduction of `mobile` sensor "stations" in 2017. + +Let's have a quick summary: +```{r exposure_summary} +exposure_counts %>% + summarise( + oldest = min(locationtimestamp), + newest = max(locationtimestamp), + count = max(count) + ) %>% + arrange(desc(count)) +``` + +## ...and grouptag +We can try to find out where the increases in growth came from, by analysing the +box count by grouptag. + +Caveats: Only a small subset of boxes has a grouptag, and we should assume +that these groups are actually bigger. Also, we can see that grouptag naming is +inconsistent (`Luftdaten`, `luftdaten.info`, ...) + +```{r grouptag_counts, message=FALSE} +grouptag_counts = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 15 or more members + filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>% + mutate(count = row_number(locationtimestamp)) + +# helper for sorting the grouptags by boxcount +sortLvls = function(oldFactor, ascending = TRUE) { + lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() + factor(oldFactor, levels = lvls) +} +grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) + +ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) + + geom_line(aes(group = grouptag)) + + xlab('Registration Date') + ylab('senseBox count') +``` + +```{r grouptag_summary} +grouptag_counts %>% + summarise( + oldest = min(locationtimestamp), + newest = max(locationtimestamp), + count = max(count) + ) %>% + arrange(desc(count)) +``` + +# Plot rate of growth and inactivity per week +First we group the boxes by `locationtimestamp` into bins of one week: +```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'} +bins = 'week' +mvavg_bins = 6 + +growth = boxes %>% + mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'registered') +``` + +We can do the same for `updatedAt`, which informs us about the last change to +a box, including uploaded measurements. As a lot of boxes were "updated" by the database +migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement` +attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes +boxes which were created but never made a measurement. + +This method of determining inactive boxes is fairly inaccurate and should be +considered an approximation, because we have no information about intermediate +inactive phases. +Also deleted boxes would probably have a big impact here. +```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'} +inactive = boxes %>% + # remove boxes that were updated in the last two days, + # b/c any box becomes inactive at some point by definition of updatedAt + filter(lastMeasurement < now() - days(2)) %>% + mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>% + filter(as.Date(week) > as.Date("2021-12-31")) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'inactive') +``` + +Now we can combine both datasets for plotting: +```{r growthrate, warning=FALSE, message=FALSE, results='hide'} +boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) + +ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + + xlab('Time') + ylab(paste('rate per ', bins)) + + scale_x_date(date_breaks="years", date_labels="%Y") + + scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + + geom_point(aes(y = count), size = 0.5) + + # moving average, make first and last value NA (to ensure identical length of vectors) + geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) +``` + +And see in which weeks the most boxes become (in)active: +```{r table_mostregistrations} +boxes_by_date %>% + filter(count > 50) %>% + arrange(desc(count)) +``` + +# Plot duration of boxes being active {.tabset} +While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity +of each box, and look at metrics by exposure and grouptag once more: + +## ...by exposure +```{r exposure_duration, message=FALSE} +durations = boxes %>% + group_by(exposure) %>% + filter(!is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(durations, aes(x = exposure, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') +``` + +The time of activity averages at only `r round(mean(durations$duration))` days, +though there are boxes with `r round(max(durations$duration))` days of activity, +spanning a large chunk of openSenseMap's existence. + +## ...by grouptag +```{r grouptag_duration, message=FALSE} +durations = boxes %>% + filter(!is.na(lastMeasurement)) %>% + group_by(grouptag) %>% + # only include grouptags with 20 or more members + filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(durations, aes(x = grouptag, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +durations %>% + summarize( + duration_avg = round(mean(duration)), + duration_min = round(min(duration)), + duration_max = round(max(duration)), + oldest_box = round(max(difftime(now(), locationtimestamp, units='days'))) + ) %>% + arrange(desc(duration_avg)) +``` + +The time of activity averages at only `r round(mean(durations$duration))` days, +though there are boxes with `r round(max(durations$duration))` days of activity, +spanning a large chunk of openSenseMap's existence. + +## ...by year of registration +This is less useful, as older boxes are active for a longer time by definition. +If you have an idea how to compensate for that, please send a [Pull Request][PR]! + +```{r year_duration, message=FALSE} +# NOTE: boxes older than 2016 missing due to missing updatedAt in database +duration = boxes %>% + mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>% + group_by(year) %>% + filter(!is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') +``` + +# More Visualisations +Other visualisations come to mind, and are left as an exercise to the reader. +If you implemented some, feel free to add them to this vignette via a [Pull Request][PR]. + +* growth by phenomenon +* growth by location -> (interactive) map +* set inactive rate in relation to total box count +* filter timespans with big dips in growth rate, and extrapolate the amount of + senseBoxes that could be on the platform today, assuming there were no production issues ;) + +[PR]: https://github.com/sensebox/opensensmapr/pulls + + diff --git a/inst/doc/osem-history_revised.html b/inst/doc/osem-history_revised.html new file mode 100644 index 0000000..8077d56 --- /dev/null +++ b/inst/doc/osem-history_revised.html @@ -0,0 +1,2471 @@ + + + + + + + + + + + + + + + +Visualising the Development of openSenseMap.org in 2022 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
+

This vignette serves as an example on data wrangling & +visualization with opensensmapr, dplyr and +ggplot2.

+
+
# required packages:
+library(opensensmapr) # data download
+library(dplyr)        # data wrangling
+library(ggplot2)      # plotting
+library(lubridate)    # date arithmetic
+library(zoo)          # rollmean()
+

openSenseMap.org has grown quite a bit in the last years; it would be +interesting to see how we got to the current 11367 sensor stations, +split up by various attributes of the boxes.

+

While opensensmapr provides extensive methods of +filtering boxes by attributes on the server, we do the filtering within +R to save time and gain flexibility.

+

So the first step is to retrieve all the boxes.

+
# if you want to see results for a specific subset of boxes,
+# just specify a filter such as grouptag='ifgi' here
+boxes_all = osem_boxes()
+boxes = boxes_all
+
+

Introduction

+

In the following we just want to have a look at the boxes created in +2022, so we filter for them.

+
boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp <="2022-12-31")
+summary(boxes) -> summary.data.frame
+
## boxes total: 2108
+## 
+## boxes by exposure:
+##  indoor  mobile outdoor unknown 
+##     524     200    1383       1 
+## 
+## boxes by model:
+##                   custom          hackair_home_v2             homeEthernet 
+##                      932                        5                        5 
+##    homeEthernetFeinstaub           homeV2Ethernet  homeV2EthernetFeinstaub 
+##                        2                        5                        5 
+##               homeV2Lora               homeV2Wifi      homeV2WifiFeinstaub 
+##                       58                      220                      114 
+##                 homeWifi        homeWifiFeinstaub        luftdaten_pms1003 
+##                       14                       17                        0 
+## luftdaten_pms1003_bme280        luftdaten_pms3003 luftdaten_pms3003_bme280 
+##                        1                        0                        0 
+##        luftdaten_pms5003 luftdaten_pms5003_bme280        luftdaten_pms7003 
+##                        2                        8                        0 
+## luftdaten_pms7003_bme280         luftdaten_sds011  luftdaten_sds011_bme280 
+##                        8                       28                      462 
+##  luftdaten_sds011_bmp180   luftdaten_sds011_dht11   luftdaten_sds011_dht22 
+##                       27                        7                      188 
+## 
+## $last_measurement_within
+##    1h    1d   30d  365d never 
+##   708   758   889  1559   498 
+## 
+## oldest box: 2020-02-29 23:00:31 (Kirchardt 1)
+## newest box: 2022-12-30 09:19:46 (Balkon)
+## 
+## sensors per box:
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   1.000   3.000   5.000   4.978   6.000  29.000
+ + + + + +

Another feature of interest is the spatial distribution of the boxes: +plot() can help us out here. This function requires a bunch +of optional dependencies though.

+
if (!require('maps'))     install.packages('maps')
+if (!require('maptools')) install.packages('maptools')
+if (!require('rgeos'))    install.packages('rgeos')
+
+plot(boxes)
+

+

But what do these sensor stations actually measure? Lets find out. +osem_phenomena() gives us a named list of of the counts of +each observed phenomenon for the given set of sensor stations:

+
phenoms = osem_phenomena(boxes)
+str(phenoms)
+
## List of 966
+##  $ Temperatur                             : int 1520
+##  $ rel. Luftfeuchte                       : int 1289
+##  $ PM10                                   : int 1070
+##  $ PM2.5                                  : int 1069
+##  $ Luftdruck                              : int 994
+##  $ Beleuchtungsstärke                     : int 282
+##  $ UV-Intensität                          : int 282
+##  $ VOC                                    : int 224
+##  $ Lufttemperatur                         : int 208
+##  $ CO₂                                    : int 178
+##  $ Bodenfeuchte                           : int 170
+##  $ Temperature                            : int 168
+##  $ Luftfeuchte                            : int 134
+##  $ Lautstärke                             : int 131
+##  $ Humidity                               : int 124
+##  $ atm. Luftdruck                         : int 114
+##  $ Kalibrierungswert                      : int 108
+##  $ CO2eq                                  : int 107
+##  $ IAQ                                    : int 107
+##  $ Temperatur SCD30                       : int 107
+##  $ rel. Luftfeuchte SCD30                 : int 107
+##  $ Pressure                               : int 98
+##  $ Luftfeuchtigkeit                       : int 62
+##  $ PM01                                   : int 58
+##  $ Bodentemperatur                        : int 55
+##  $ Windgeschwindigkeit                    : int 46
+##  $ Feinstaub PM10                         : int 41
+##  $ Feinstaub PM2.5                        : int 32
+##  $ Feinstaub PM1.0                        : int 28
+##  $ Batterie                               : int 22
+##  $ Taupunkt                               : int 19
+##  $ Windrichtung                           : int 19
+##  $ rel. Luftfeuchte (HECA)                : int 16
+##  $ Temperatur (HECA)                      : int 15
+##  $ Temperatura                            : int 15
+##  $ Durchschnitt Umgebungslautstärke       : int 14
+##  $ Helligkeit                             : int 14
+##  $ Latitude                               : int 14
+##  $ Longtitude                             : int 14
+##  $ Minimum Umgebungslautstärke            : int 14
+##  $ RSSI                                   : int 14
+##  $ UV-Index                               : int 11
+##  $ CO2                                    : int 10
+##  $ PM1                                    : int 10
+##  $ rel-. Luftfeuchte                      : int 10
+##  $ temperature                            : int 10
+##  $ Abstand nach links                     : int 9
+##  $ Abstand nach rechts                    : int 9
+##  $ Beschleunigung X-Achse                 : int 9
+##  $ Beschleunigung Y-Achse                 : int 9
+##  $ Beschleunigung Z-Achse                 : int 9
+##  $ Feinstaub PM25                         : int 9
+##  $ Luftdruck absolut                      : int 9
+##  $ Luftdruck relativ                      : int 9
+##  $ Regenrate                              : int 9
+##  $ Sonnenstrahlung                        : int 9
+##  $ gefühlte Temperatur                    : int 9
+##  $ Geschwindigkeit                        : int 8
+##  $ humidity                               : int 8
+##  $ Bodenfeuchtigkeit                      : int 7
+##  $ Baterie                                : int 6
+##  $ Bodenfeuchte 10cm                      : int 6
+##  $ Bodenfeuchte 30cm                      : int 6
+##  $ Bodentemperatur 10cm                   : int 6
+##  $ Bodentemperatur 30cm                   : int 6
+##  $ Lumina                                 : int 6
+##  $ Taupunktdifferenz (Spread)             : int 6
+##  $ UV                                     : int 6
+##  $ Umiditate                              : int 6
+##  $ Air pressure                           : int 5
+##  $ Battery                                : int 5
+##  $ PM4                                    : int 5
+##  $ Pegel                                  : int 5
+##  $ Prezenta-foc                           : int 5
+##  $ Regenmenge                             : int 5
+##  $ W-LAN                                  : int 5
+##  $ Bodenfeuchte 1                         : int 4
+##  $ Bodenfeuchte 2                         : int 4
+##  $ Bodentemperatur 1                      : int 4
+##  $ Bodentemperatur 2                      : int 4
+##  $ CO2 Konzentration                      : int 4
+##  $ Gesamt Gewicht                         : int 4
+##  $ PM 2.5                                 : int 4
+##  $ Température                            : int 4
+##  $ Wilgotność                             : int 4
+##  $ Windspeed                              : int 4
+##  $ absolute Luftfeuchtigkeit              : int 4
+##  $ Battery Voltage                        : int 3
+##  $ Befehl                                 : int 3
+##  $ Bodenfeuchte 50cm                      : int 3
+##  $ Bodentemperatur 60cm                   : int 3
+##  $ Ciśnienie                              : int 3
+##  $ Feinstaub                              : int 3
+##  $ Humedad                                : int 3
+##  $ Höhe                                   : int 3
+##  $ Laufzeit                               : int 3
+##  $ LuftfeuchteBME                         : int 3
+##  $ Luftfeuchtigkeit_01                    : int 3
+##  $ LufttemperaturBME                      : int 3
+##   [list output truncated]
+

Thats quite some noise there, with many phenomena being measured by a +single sensor only, or many duplicated phenomena due to slightly +different spellings. We should clean that up, but for now let’s just +filter out the noise and find those phenomena with high sensor +numbers:

+
phenoms[phenoms > 50]
+
## $Temperatur
+## [1] 1520
+## 
+## $`rel. Luftfeuchte`
+## [1] 1289
+## 
+## $PM10
+## [1] 1070
+## 
+## $PM2.5
+## [1] 1069
+## 
+## $Luftdruck
+## [1] 994
+## 
+## $Beleuchtungsstärke
+## [1] 282
+## 
+## $`UV-Intensität`
+## [1] 282
+## 
+## $VOC
+## [1] 224
+## 
+## $Lufttemperatur
+## [1] 208
+## 
+## $`CO₂`
+## [1] 178
+## 
+## $Bodenfeuchte
+## [1] 170
+## 
+## $Temperature
+## [1] 168
+## 
+## $Luftfeuchte
+## [1] 134
+## 
+## $Lautstärke
+## [1] 131
+## 
+## $Humidity
+## [1] 124
+## 
+## $`atm. Luftdruck`
+## [1] 114
+## 
+## $Kalibrierungswert
+## [1] 108
+## 
+## $CO2eq
+## [1] 107
+## 
+## $IAQ
+## [1] 107
+## 
+## $`Temperatur SCD30`
+## [1] 107
+## 
+## $`rel. Luftfeuchte SCD30`
+## [1] 107
+## 
+## $Pressure
+## [1] 98
+## 
+## $Luftfeuchtigkeit
+## [1] 62
+## 
+## $PM01
+## [1] 58
+## 
+## $Bodentemperatur
+## [1] 55
+
+
+

Plot count of boxes by time

+

By looking at the createdAt attribute of each box we +know the exact time a box was registered. Because of some database +migration issues the createdAt values are mostly wrong +(~80% of boxes created 2022-03-30), so we are using the +timestamp attribute of the currentlocation +which should in most cases correspond to the creation date.

+

With this approach we have no information about boxes that were +deleted in the meantime, but that’s okay for now.

+
+

…and exposure

+
exposure_counts = boxes %>%
+  group_by(exposure) %>%
+  mutate(count = row_number(locationtimestamp))
+
+exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
+ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
+  geom_line() +
+  scale_colour_manual(values = exposure_colors) +
+  xlab('Registration Date') + ylab('senseBox count')
+

+

Outdoor boxes are growing fast! We can also see the +introduction of mobile sensor “stations” in 2017.

+

Let’s have a quick summary:

+
exposure_counts %>%
+  summarise(
+    oldest = min(locationtimestamp),
+    newest = max(locationtimestamp),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
exposureoldestnewestcount
outdoor2022-01-01 11:59:162022-12-30 09:19:461383
indoor2022-01-02 11:06:082022-12-23 20:46:45524
mobile2022-01-06 13:20:002022-12-21 21:35:16200
unknown2022-03-01 07:04:302022-03-01 07:04:301
+
+
+
+

…and grouptag

+

We can try to find out where the increases in growth came from, by +analysing the box count by grouptag.

+

Caveats: Only a small subset of boxes has a grouptag, and we should +assume that these groups are actually bigger. Also, we can see that +grouptag naming is inconsistent (Luftdaten, +luftdaten.info, …)

+
grouptag_counts = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 15 or more members
+  filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>%
+  mutate(count = row_number(locationtimestamp))
+
+# helper for sorting the grouptags by boxcount
+sortLvls = function(oldFactor, ascending = TRUE) {
+  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
+  factor(oldFactor, levels = lvls)
+}
+grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
+
+ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
+  geom_line(aes(group = grouptag)) +
+  xlab('Registration Date') + ylab('senseBox count')
+

+
grouptag_counts %>%
+  summarise(
+    oldest = min(locationtimestamp),
+    newest = max(locationtimestamp),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
grouptagoldestnewestcount
edu2022-01-02 11:06:082022-12-18 12:38:27127
HU Explorers2022-04-01 09:07:412022-12-14 10:11:34123
321heiss2022-07-09 01:29:372022-09-01 06:27:3591
Captographies2022-06-03 11:25:272022-11-16 13:26:3958
SUGUCS2022-11-30 15:25:322022-12-03 10:11:4123
SekSeeland2022-03-14 13:17:172022-03-22 20:23:5819
BurgerMeetnet2022-01-24 15:33:192022-05-10 21:22:3516
AGIN2022-11-28 17:33:122022-11-28 17:42:1815
BRGL2022-11-06 19:23:432022-11-06 22:08:3615
BRGW2022-11-02 10:28:522022-11-02 13:32:1215
Burgermeetnet2022-01-15 20:43:162022-02-11 17:59:0515
HTLJ2022-11-21 22:04:172022-11-21 22:05:4715
MSGB2022-11-14 09:08:572022-11-14 10:19:2415
MSHO2022-12-20 09:28:402022-12-20 10:01:3815
MSIN2022-11-21 17:02:392022-11-21 23:06:2215
Mikroprojekt Mitmachklima2022-02-09 10:28:402022-08-23 13:14:1115
+
+
+
+
+

Plot rate of growth and inactivity per week

+

First we group the boxes by locationtimestamp into bins +of one week:

+
bins = 'week'
+mvavg_bins = 6
+
+growth = boxes %>%
+  mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'registered')
+

We can do the same for updatedAt, which informs us about +the last change to a box, including uploaded measurements. As a lot of +boxes were “updated” by the database migration, many of them are updated +at 2022-03-30, so we try to use the lastMeasurement +attribute instead of updatedAt. This leads to fewer boxes +but also automatically excludes boxes which were created but never made +a measurement.

+

This method of determining inactive boxes is fairly inaccurate and +should be considered an approximation, because we have no information +about intermediate inactive phases. Also deleted boxes would probably +have a big impact here.

+
inactive = boxes %>%
+  # remove boxes that were updated in the last two days,
+  # b/c any box becomes inactive at some point by definition of updatedAt
+  filter(lastMeasurement < now() - days(2)) %>%
+  mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
+  filter(as.Date(week) > as.Date("2021-12-31")) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'inactive')
+

Now we can combine both datasets for plotting:

+
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
+
+ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
+  xlab('Time') + ylab(paste('rate per ', bins)) +
+  scale_x_date(date_breaks="years", date_labels="%Y") +
+  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
+  geom_point(aes(y = count), size = 0.5) +
+  # moving average, make first and last value NA (to ensure identical length of vectors)
+  geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
+

+

And see in which weeks the most boxes become (in)active:

+
boxes_by_date %>%
+  filter(count > 50) %>%
+  arrange(desc(count))
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
weekcountevent
2022-11-2192registered
2022-06-0677registered
2022-08-2976registered
2022-10-3171registered
2022-11-1467registered
2022-11-2865registered
2022-08-2261registered
2022-02-2857registered
2022-08-2955inactive
2022-03-2154registered
2022-12-1254registered
2022-01-2451registered
+
+
+
+

Plot duration of boxes being active

+

While we are looking at locationtimestamp and +lastMeasurement, we can also extract the duration of +activity of each box, and look at metrics by exposure and grouptag once +more:

+
+

…by exposure

+
durations = boxes %>%
+  group_by(exposure) %>%
+  filter(!is.na(lastMeasurement)) %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  filter(duration >= 0)
+
+ggplot(durations, aes(x = exposure, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+

+

The time of activity averages at only 145 days, though there are +boxes with 418 days of activity, spanning a large chunk of +openSenseMap’s existence.

+
+
+

…by grouptag

+
durations = boxes %>%
+  filter(!is.na(lastMeasurement)) %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 20 or more members
+  filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  filter(duration >= 0)
+  
+ggplot(durations, aes(x = grouptag, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+

+
durations %>%
+  summarize(
+    duration_avg = round(mean(duration)),
+    duration_min = round(min(duration)),
+    duration_max = round(max(duration)),
+    oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
+  ) %>%
+  arrange(desc(duration_avg))
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
grouptagduration_avgduration_minduration_maxoldest_box
Burgermeetnet236 days0 days404 days404 days
BurgerMeetnet169 days0 days395 days395 days
Captographies122 days0 days261 days263 days
BRGL107 days85 days109 days109 days
MSGB87 days39 days101 days101 days
edu87 days0 days408 days408 days
AGIN86 days86 days86 days87 days
HTLJ84 days58 days94 days94 days
MSHO57 days36 days65 days65 days
HU Explorers44 days0 days319 days328 days
321heiss0 days0 days0 days229 days
+
+

The time of activity averages at only 70 days, though there are boxes +with 408 days of activity, spanning a large chunk of openSenseMap’s +existence.

+
+
+

…by year of registration

+

This is less useful, as older boxes are active for a longer time by +definition. If you have an idea how to compensate for that, please send +a Pull +Request!

+
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
+duration = boxes %>%
+  mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
+  group_by(year) %>%
+  filter(!is.na(lastMeasurement)) %>%
+  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
+  filter(duration >= 0)
+
+ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
+

+
+
+
+

More Visualisations

+

Other visualisations come to mind, and are left as an exercise to the +reader. If you implemented some, feel free to add them to this vignette +via a Pull +Request.

+
    +
  • growth by phenomenon
  • +
  • growth by location -> (interactive) map
  • +
  • set inactive rate in relation to total box count
  • +
  • filter timespans with big dips in growth rate, and extrapolate the +amount of senseBoxes that could be on the platform today, assuming there +were no production issues ;)
  • +
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + + diff --git a/inst/doc/osem-intro.R b/inst/doc/osem-intro.R new file mode 100644 index 0000000..0732734 --- /dev/null +++ b/inst/doc/osem-intro.R @@ -0,0 +1,73 @@ +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set(echo = TRUE) + +## ----results = F-------------------------------------------------------------- +library(magrittr) +library(opensensmapr) + +all_sensors = osem_boxes() + +## ----------------------------------------------------------------------------- +summary(all_sensors) + +## ----message=F, warning=F----------------------------------------------------- +if (!require('maps')) install.packages('maps') +if (!require('maptools')) install.packages('maptools') +if (!require('rgeos')) install.packages('rgeos') + +plot(all_sensors) + +## ----------------------------------------------------------------------------- +phenoms = osem_phenomena(all_sensors) +str(phenoms) + +## ----------------------------------------------------------------------------- +phenoms[phenoms > 20] + +## ----results = F-------------------------------------------------------------- +pm25_sensors = osem_boxes( + exposure = 'outdoor', + date = Sys.time(), # ±4 hours + phenomenon = 'PM2.5' +) + +## ----------------------------------------------------------------------------- +summary(pm25_sensors) +plot(pm25_sensors) + +## ----------------------------------------------------------------------------- +library(sf) +library(units) +library(lubridate) +library(dplyr) + +# construct a bounding box: 12 kilometers around Berlin +berlin = st_point(c(13.4034, 52.5120)) %>% + st_sfc(crs = 4326) %>% + st_transform(3857) %>% # allow setting a buffer in meters + st_buffer(set_units(12, km)) %>% + st_transform(4326) %>% # the opensensemap expects WGS 84 + st_bbox() + +## ----results = F-------------------------------------------------------------- +pm25 = osem_measurements( + berlin, + phenomenon = 'PM2.5', + from = now() - days(3), # defaults to 2 days + to = now() +) + +plot(pm25) + +## ----------------------------------------------------------------------------- +outliers = filter(pm25, value > 100)$sensorId +bad_sensors = outliers[, drop = T] %>% levels() + +pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors) + +## ----------------------------------------------------------------------------- +st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T) + +## ----------------------------------------------------------------------------- +pm25 %>% filter(invalid == FALSE) %>% plot() + diff --git a/inst/doc/osem-intro.Rmd b/inst/doc/osem-intro.Rmd new file mode 100644 index 0000000..8714a9f --- /dev/null +++ b/inst/doc/osem-intro.Rmd @@ -0,0 +1,151 @@ +--- +title: "Exploring the openSenseMap Dataset" +author: "Norwin Roosen" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + fig_margin: 0 + fig_width: 6 + fig_height: 4 +vignette: > + %\VignetteIndexEntry{Exploring the openSenseMap Dataset} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +This package provides data ingestion functions for almost any data stored on the +open data platform for environmental sensordata . +Its main goals are to provide means for: + +- big data analysis of the measurements stored on the platform +- sensor metadata analysis (sensor counts, spatial distribution, temporal trends) + +### Exploring the dataset +Before we look at actual observations, lets get a grasp of the openSenseMap +datasets' structure. + +```{r results = F} +library(magrittr) +library(opensensmapr) + +all_sensors = osem_boxes() +``` +```{r} +summary(all_sensors) +``` + +This gives a good overview already: As of writing this, there are more than 700 +sensor stations, of which ~50% are currently running. Most of them are placed +outdoors and have around 5 sensors each. +The oldest station is from May 2014, while the latest station was registered a +couple of minutes ago. + +Another feature of interest is the spatial distribution of the boxes: `plot()` +can help us out here. This function requires a bunch of optional dependencies though. + +```{r message=F, warning=F} +if (!require('maps')) install.packages('maps') +if (!require('maptools')) install.packages('maptools') +if (!require('rgeos')) install.packages('rgeos') + +plot(all_sensors) +``` + +It seems we have to reduce our area of interest to Germany. + +But what do these sensor stations actually measure? Lets find out. +`osem_phenomena()` gives us a named list of of the counts of each observed +phenomenon for the given set of sensor stations: + +```{r} +phenoms = osem_phenomena(all_sensors) +str(phenoms) +``` + +Thats quite some noise there, with many phenomena being measured by a single +sensor only, or many duplicated phenomena due to slightly different spellings. +We should clean that up, but for now let's just filter out the noise and find +those phenomena with high sensor numbers: + +```{r} +phenoms[phenoms > 20] +``` + +Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more +interesting to analyze though. +We should check how many sensor stations provide useful data: We want only those +boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting +measurements: + +```{r results = F} +pm25_sensors = osem_boxes( + exposure = 'outdoor', + date = Sys.time(), # ±4 hours + phenomenon = 'PM2.5' +) +``` +```{r} +summary(pm25_sensors) +plot(pm25_sensors) +``` + +Thats still more than 200 measuring stations, we can work with that. + +### Analyzing sensor data +Having analyzed the available data sources, let's finally get some measurements. +We could call `osem_measurements(pm25_sensors)` now, however we are focusing on +a restricted area of interest, the city of Berlin. +Luckily we can get the measurements filtered by a bounding box: + +```{r} +library(sf) +library(units) +library(lubridate) +library(dplyr) + +# construct a bounding box: 12 kilometers around Berlin +berlin = st_point(c(13.4034, 52.5120)) %>% + st_sfc(crs = 4326) %>% + st_transform(3857) %>% # allow setting a buffer in meters + st_buffer(set_units(12, km)) %>% + st_transform(4326) %>% # the opensensemap expects WGS 84 + st_bbox() +``` +```{r results = F} +pm25 = osem_measurements( + berlin, + phenomenon = 'PM2.5', + from = now() - days(3), # defaults to 2 days + to = now() +) + +plot(pm25) +``` + +Now we can get started with actual spatiotemporal data analysis. +First, lets mask the seemingly uncalibrated sensors: + +```{r} +outliers = filter(pm25, value > 100)$sensorId +bad_sensors = outliers[, drop = T] %>% levels() + +pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors) +``` + +Then plot the measuring locations, flagging the outliers: + +```{r} +st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T) +``` + +Removing these sensors yields a nicer time series plot: + +```{r} +pm25 %>% filter(invalid == FALSE) %>% plot() +``` + +Further analysis: comparison with LANUV data `TODO` diff --git a/inst/doc/osem-intro.html b/inst/doc/osem-intro.html new file mode 100644 index 0000000..f28d894 --- /dev/null +++ b/inst/doc/osem-intro.html @@ -0,0 +1,870 @@ + + + + + + + + + + + + + + + + +Exploring the openSenseMap Dataset + + + + + + + + + + + + + + + + + + + + + + + + + + +

Exploring the openSenseMap Dataset

+

Norwin Roosen

+

2023-02-23

+ + + +

This package provides data ingestion functions for almost any data +stored on the open data platform for environmental sensordata https://opensensemap.org. Its main goals are to provide +means for:

+ +
+

Exploring the dataset

+

Before we look at actual observations, lets get a grasp of the +openSenseMap datasets’ structure.

+
library(magrittr)
+library(opensensmapr)
+
+all_sensors = osem_boxes()
+
summary(all_sensors)
+
## boxes total: 11367
+## 
+## boxes by exposure:
+##  indoor  mobile outdoor unknown 
+##    2344     591    8413      19 
+## 
+## boxes by model:
+##                   custom          hackair_home_v2             homeEthernet 
+##                     2776                       73                       73 
+##    homeEthernetFeinstaub           homeV2Ethernet  homeV2EthernetFeinstaub 
+##                       55                       21                       40 
+##               homeV2Lora               homeV2Wifi      homeV2WifiFeinstaub 
+##                      246                      578                      743 
+##                 homeWifi        homeWifiFeinstaub        luftdaten_pms1003 
+##                      215                      222                        9 
+## luftdaten_pms1003_bme280        luftdaten_pms3003 luftdaten_pms3003_bme280 
+##                       10                        1                        7 
+##        luftdaten_pms5003 luftdaten_pms5003_bme280        luftdaten_pms7003 
+##                        7                       60                        6 
+## luftdaten_pms7003_bme280         luftdaten_sds011  luftdaten_sds011_bme280 
+##                       78                      285                     3060 
+##  luftdaten_sds011_bmp180   luftdaten_sds011_dht11   luftdaten_sds011_dht22 
+##                      114                      135                     2553 
+## 
+## $last_measurement_within
+##    1h    1d   30d  365d never 
+##  3601  3756  4252  5938  2052 
+## 
+## oldest box: 2016-08-09 19:34:42 (OBS Bohmte UK_02)
+## newest box: 2023-02-23 07:56:59 (Steinbrink 29)
+## 
+## sensors per box:
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   1.000   4.000   5.000   4.981   5.000  76.000
+

This gives a good overview already: As of writing this, there are +more than 700 sensor stations, of which ~50% are currently running. Most +of them are placed outdoors and have around 5 sensors each. The oldest +station is from May 2014, while the latest station was registered a +couple of minutes ago.

+

Another feature of interest is the spatial distribution of the boxes: +plot() can help us out here. This function requires a bunch +of optional dependencies though.

+
if (!require('maps'))     install.packages('maps')
+if (!require('maptools')) install.packages('maptools')
+if (!require('rgeos'))    install.packages('rgeos')
+
+plot(all_sensors)
+

+

It seems we have to reduce our area of interest to Germany.

+

But what do these sensor stations actually measure? Lets find out. +osem_phenomena() gives us a named list of of the counts of +each observed phenomenon for the given set of sensor stations:

+
phenoms = osem_phenomena(all_sensors)
+str(phenoms)
+
## List of 3289
+##  $ Temperatur                                                   : int 9385
+##  $ rel. Luftfeuchte                                             : int 8317
+##  $ PM10                                                         : int 8147
+##  $ PM2.5                                                        : int 8135
+##  $ Luftdruck                                                    : int 5667
+##  $ Beleuchtungsstärke                                           : int 1674
+##  $ UV-Intensität                                                : int 1665
+##  $ Temperature                                                  : int 643
+##  $ Humidity                                                     : int 473
+##  $ VOC                                                          : int 422
+##  $ Luftfeuchte                                                  : int 362
+##  $ Lufttemperatur                                               : int 356
+##  $ CO₂                                                          : int 304
+##  $ Pressure                                                     : int 293
+##  $ Bodenfeuchte                                                 : int 284
+##  $ Luftfeuchtigkeit                                             : int 272
+##  $ atm. Luftdruck                                               : int 245
+##  $ Lautstärke                                                   : int 240
+##  $ PM01                                                         : int 206
+##  $ IAQ                                                          : int 162
+##  $ Kalibrierungswert                                            : int 156
+##  $ rel. Luftfeuchte SCD30                                       : int 156
+##  $ Bodentemperatur                                              : int 155
+##  $ Temperatur SCD30                                             : int 154
+##  $ CO2eq                                                        : int 153
+##  $ Windgeschwindigkeit                                          : int 152
+##  $ pH-Wert                                                      : int 123
+##  $ Gesamthärte                                                  : int 122
+##  $ Blei                                                         : int 120
+##  $ Eisen                                                        : int 120
+##  $ GesamthaerteLabor                                            : int 120
+##  $ Gesamthärte 2                                                : int 120
+##  $ Kupfer C                                                     : int 120
+##  $ Kupfer D                                                     : int 120
+##  $ Kupfer1                                                      : int 120
+##  $ Kupfer2                                                      : int 120
+##  $ Nitrat                                                       : int 120
+##  $ Nitrit                                                       : int 120
+##  $ CO2                                                          : int 112
+##  $ Feinstaub PM10                                               : int 98
+##  $ Windrichtung                                                 : int 82
+##  $ rel. Luftfeuchte (HECA)                                      : int 74
+##  $ Temperatur (HECA)                                            : int 72
+##  $ Temperatura                                                  : int 69
+##  $ Helligkeit                                                   : int 67
+##  $ Feinstaub PM2.5                                              : int 65
+##  $ Taupunkt                                                     : int 62
+##  $ Latitude                                                     : int 61
+##  $ Longtitude                                                   : int 58
+##  $ Durchschnitt Umgebungslautstärke                             : int 51
+##  $ Minimum Umgebungslautstärke                                  : int 51
+##  $ UV-Index                                                     : int 49
+##  $ temperature                                                  : int 46
+##  $ Batterie                                                     : int 45
+##  $ Feinstaub PM1.0                                              : int 41
+##  $ Umgebungslautstärke                                          : int 41
+##  $ UV                                                           : int 40
+##  $ humidity                                                     : int 38
+##  $ Abstand nach links                                           : int 34
+##  $ Beschleunigung Z-Achse                                       : int 34
+##  $ Beschleunigung X-Achse                                       : int 33
+##  $ Beschleunigung Y-Achse                                       : int 33
+##  $ Geschwindigkeit                                              : int 33
+##  $ Niederschlag                                                 : int 33
+##  $ Feinstaub PM25                                               : int 32
+##  $ PM1                                                          : int 32
+##  $ Abstand nach rechts                                          : int 31
+##  $ PM1.0                                                        : int 30
+##  $ rel. Luftfeuchtigkeit                                        : int 30
+##  $ Relative Humidity                                            : int 29
+##  $ Sonnenstrahlung                                              : int 29
+##  $ Luftdruck relativ                                            : int 28
+##  $ Luftdruck absolut                                            : int 26
+##  $ Rain                                                         : int 26
+##  $ Regenrate                                                    : int 26
+##  $ CO2 Konzentration                                            : int 25
+##  $ RSSI                                                         : int 22
+##  $ gefühlte Temperatur                                          : int 22
+##  $ PM 2.5                                                       : int 21
+##  $ Battery                                                      : int 20
+##  $ Ciśnienie                                                    : int 20
+##  $ Air Pressure                                                 : int 19
+##  $ Regen                                                        : int 19
+##  $ Schall                                                       : int 19
+##  $ Signal                                                       : int 19
+##  $ Ilmanpaine                                                   : int 18
+##  $ Lämpötila                                                    : int 18
+##  $ UV Index                                                     : int 18
+##  $ Wind speed                                                   : int 18
+##  $ PM 10                                                        : int 17
+##  $ PM4                                                          : int 17
+##  $ Air pressure                                                 : int 16
+##  $ Temperatur DHT22                                             : int 16
+##  $ Wind Direction                                               : int 16
+##  $ Altitude                                                     : int 15
+##  $ Illuminance                                                  : int 15
+##  $ Speed                                                        : int 15
+##  $ Wind Speed                                                   : int 15
+##  $ pressure                                                     : int 15
+##   [list output truncated]
+

Thats quite some noise there, with many phenomena being measured by a +single sensor only, or many duplicated phenomena due to slightly +different spellings. We should clean that up, but for now let’s just +filter out the noise and find those phenomena with high sensor +numbers:

+
phenoms[phenoms > 20]
+
## $Temperatur
+## [1] 9385
+## 
+## $`rel. Luftfeuchte`
+## [1] 8317
+## 
+## $PM10
+## [1] 8147
+## 
+## $PM2.5
+## [1] 8135
+## 
+## $Luftdruck
+## [1] 5667
+## 
+## $Beleuchtungsstärke
+## [1] 1674
+## 
+## $`UV-Intensität`
+## [1] 1665
+## 
+## $Temperature
+## [1] 643
+## 
+## $Humidity
+## [1] 473
+## 
+## $VOC
+## [1] 422
+## 
+## $Luftfeuchte
+## [1] 362
+## 
+## $Lufttemperatur
+## [1] 356
+## 
+## $`CO₂`
+## [1] 304
+## 
+## $Pressure
+## [1] 293
+## 
+## $Bodenfeuchte
+## [1] 284
+## 
+## $Luftfeuchtigkeit
+## [1] 272
+## 
+## $`atm. Luftdruck`
+## [1] 245
+## 
+## $Lautstärke
+## [1] 240
+## 
+## $PM01
+## [1] 206
+## 
+## $IAQ
+## [1] 162
+## 
+## $Kalibrierungswert
+## [1] 156
+## 
+## $`rel. Luftfeuchte SCD30`
+## [1] 156
+## 
+## $Bodentemperatur
+## [1] 155
+## 
+## $`Temperatur SCD30`
+## [1] 154
+## 
+## $CO2eq
+## [1] 153
+## 
+## $Windgeschwindigkeit
+## [1] 152
+## 
+## $`pH-Wert`
+## [1] 123
+## 
+## $Gesamthärte
+## [1] 122
+## 
+## $Blei
+## [1] 120
+## 
+## $Eisen
+## [1] 120
+## 
+## $GesamthaerteLabor
+## [1] 120
+## 
+## $`Gesamthärte 2`
+## [1] 120
+## 
+## $`Kupfer C`
+## [1] 120
+## 
+## $`Kupfer D`
+## [1] 120
+## 
+## $Kupfer1
+## [1] 120
+## 
+## $Kupfer2
+## [1] 120
+## 
+## $Nitrat
+## [1] 120
+## 
+## $Nitrit
+## [1] 120
+## 
+## $CO2
+## [1] 112
+## 
+## $`Feinstaub PM10`
+## [1] 98
+## 
+## $Windrichtung
+## [1] 82
+## 
+## $`rel. Luftfeuchte (HECA)`
+## [1] 74
+## 
+## $`Temperatur (HECA)`
+## [1] 72
+## 
+## $Temperatura
+## [1] 69
+## 
+## $Helligkeit
+## [1] 67
+## 
+## $`Feinstaub PM2.5`
+## [1] 65
+## 
+## $Taupunkt
+## [1] 62
+## 
+## $Latitude
+## [1] 61
+## 
+## $Longtitude
+## [1] 58
+## 
+## $`Durchschnitt Umgebungslautstärke`
+## [1] 51
+## 
+## $`Minimum Umgebungslautstärke`
+## [1] 51
+## 
+## $`UV-Index`
+## [1] 49
+## 
+## $temperature
+## [1] 46
+## 
+## $Batterie
+## [1] 45
+## 
+## $`Feinstaub PM1.0`
+## [1] 41
+## 
+## $Umgebungslautstärke
+## [1] 41
+## 
+## $UV
+## [1] 40
+## 
+## $humidity
+## [1] 38
+## 
+## $`Abstand nach links`
+## [1] 34
+## 
+## $`Beschleunigung Z-Achse`
+## [1] 34
+## 
+## $`Beschleunigung X-Achse`
+## [1] 33
+## 
+## $`Beschleunigung Y-Achse`
+## [1] 33
+## 
+## $Geschwindigkeit
+## [1] 33
+## 
+## $Niederschlag
+## [1] 33
+## 
+## $`Feinstaub PM25`
+## [1] 32
+## 
+## $PM1
+## [1] 32
+## 
+## $`Abstand nach rechts`
+## [1] 31
+## 
+## $PM1.0
+## [1] 30
+## 
+## $`rel. Luftfeuchtigkeit`
+## [1] 30
+## 
+## $`Relative Humidity`
+## [1] 29
+## 
+## $Sonnenstrahlung
+## [1] 29
+## 
+## $`Luftdruck relativ`
+## [1] 28
+## 
+## $`Luftdruck absolut`
+## [1] 26
+## 
+## $Rain
+## [1] 26
+## 
+## $Regenrate
+## [1] 26
+## 
+## $`CO2 Konzentration`
+## [1] 25
+## 
+## $RSSI
+## [1] 22
+## 
+## $`gefühlte Temperatur`
+## [1] 22
+## 
+## $`PM 2.5`
+## [1] 21
+

Alright, temperature it is! Fine particulate matter (PM2.5) seems to +be more interesting to analyze though. We should check how many sensor +stations provide useful data: We want only those boxes with a PM2.5 +sensor, that are placed outdoors and are currently submitting +measurements:

+
pm25_sensors = osem_boxes(
+  exposure = 'outdoor',
+  date = Sys.time(), # ±4 hours
+  phenomenon = 'PM2.5'
+)
+
summary(pm25_sensors)
+
## boxes total: 3002
+## 
+## boxes by exposure:
+## outdoor 
+##    3002 
+## 
+## boxes by model:
+##                   custom          hackair_home_v2    homeEthernetFeinstaub 
+##                      174                        8                       12 
+##  homeV2EthernetFeinstaub               homeV2Lora               homeV2Wifi 
+##                       10                       21                        2 
+##      homeV2WifiFeinstaub                 homeWifi        homeWifiFeinstaub 
+##                      126                        3                       30 
+##        luftdaten_pms1003 luftdaten_pms1003_bme280        luftdaten_pms5003 
+##                        1                        2                        3 
+## luftdaten_pms5003_bme280        luftdaten_pms7003 luftdaten_pms7003_bme280 
+##                       11                        2                       26 
+##         luftdaten_sds011  luftdaten_sds011_bme280  luftdaten_sds011_bmp180 
+##                      115                     1365                       59 
+##   luftdaten_sds011_dht11   luftdaten_sds011_dht22 
+##                       45                      987 
+## 
+## $last_measurement_within
+##    1h    1d   30d  365d never 
+##  2977  3002  3002  3002     0 
+## 
+## oldest box: 2017-03-03 18:20:43 (Witten Heven Dorf)
+## newest box: 2023-02-23 07:56:59 (Steinbrink 29)
+## 
+## sensors per box:
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   2.000   4.000   5.000   4.838   5.000  26.000
+
plot(pm25_sensors)
+

+

Thats still more than 200 measuring stations, we can work with +that.

+
+
+

Analyzing sensor data

+

Having analyzed the available data sources, let’s finally get some +measurements. We could call osem_measurements(pm25_sensors) +now, however we are focusing on a restricted area of interest, the city +of Berlin. Luckily we can get the measurements filtered by a bounding +box:

+
library(sf)
+
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
+
library(units)
+
## udunits database from C:/Software/RPackages/units/share/udunits/udunits2.xml
+
library(lubridate)
+library(dplyr)
+
+# construct a bounding box: 12 kilometers around Berlin
+berlin = st_point(c(13.4034, 52.5120)) %>%
+  st_sfc(crs = 4326) %>%
+  st_transform(3857) %>% # allow setting a buffer in meters
+  st_buffer(set_units(12, km)) %>%
+  st_transform(4326) %>% # the opensensemap expects WGS 84
+  st_bbox()
+
pm25 = osem_measurements(
+  berlin,
+  phenomenon = 'PM2.5',
+  from = now() - days(3), # defaults to 2 days
+  to = now()
+)
+
+plot(pm25)
+

+

Now we can get started with actual spatiotemporal data analysis. +First, lets mask the seemingly uncalibrated sensors:

+
outliers = filter(pm25, value > 100)$sensorId
+bad_sensors = outliers[, drop = T] %>% levels()
+
+pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
+

Then plot the measuring locations, flagging the outliers:

+
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
+

+

Removing these sensors yields a nicer time series plot:

+
pm25 %>% filter(invalid == FALSE) %>% plot()
+

+

Further analysis: comparison with LANUV data TODO

+
+ + + + + + + + + + + diff --git a/inst/doc/osem-serialization.R b/inst/doc/osem-serialization.R new file mode 100644 index 0000000..ef056b6 --- /dev/null +++ b/inst/doc/osem-serialization.R @@ -0,0 +1,51 @@ +## ----setup, results='hide'---------------------------------------------------- +# this vignette requires: +library(opensensmapr) +library(jsonlite) +library(readr) + +## ----cache-------------------------------------------------------------------- +b = osem_boxes(grouptag = 'ifgi', cache = tempdir()) + +# the next identical request will hit the cache only! +b = osem_boxes(grouptag = 'ifgi', cache = tempdir()) + +# requests without the cache parameter will still be performed normally +b = osem_boxes(grouptag = 'ifgi') + +## ----cachelisting------------------------------------------------------------- +list.files(tempdir(), pattern = 'osemcache\\..*\\.rds') + +## ----cache_custom------------------------------------------------------------- +cacheDir = getwd() # current working directory +b = osem_boxes(grouptag = 'ifgi', cache = cacheDir) + +# the next identical request will hit the cache only! +b = osem_boxes(grouptag = 'ifgi', cache = cacheDir) + +## ----clearcache, results='hide'----------------------------------------------- +osem_clear_cache() # clears default cache +osem_clear_cache(getwd()) # clears a custom cache + +## ----data, results='hide'----------------------------------------------------- +# first get our example data: +measurements = osem_measurements('Windgeschwindigkeit') + +## ----serialize_json----------------------------------------------------------- +# serializing senseBoxes to JSON, and loading from file again: +write(jsonlite::serializeJSON(measurements), 'measurements.json') +measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json')) +class(measurements_from_file) + +## ----serialize_attrs---------------------------------------------------------- +# note the toJSON call instead of serializeJSON +write(jsonlite::toJSON(measurements), 'measurements_bad.json') +measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json') +class(measurements_without_attrs) + +measurements_with_attrs = osem_as_measurements(measurements_without_attrs) +class(measurements_with_attrs) + +## ----cleanup, include=FALSE--------------------------------------------------- +file.remove('measurements.json', 'measurements_bad.json') + diff --git a/inst/doc/osem-serialization.Rmd b/inst/doc/osem-serialization.Rmd new file mode 100644 index 0000000..c09e567 --- /dev/null +++ b/inst/doc/osem-serialization.Rmd @@ -0,0 +1,106 @@ +--- +title: "Caching openSenseMap Data for Reproducibility" +author: "Norwin Roosen" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +It may be useful to download data from openSenseMap only once. +For reproducible results, the data should be saved to disk, and reloaded at a +later point. + +This avoids.. + +- changed results for queries without date parameters, +- unnecessary wait times, +- risk of API changes / API unavailability, +- stress on the openSenseMap-server. + +This vignette shows how to use this built in `opensensmapr` feature, and +how to do it yourself in case you want to save to other data formats. + +```{r setup, results='hide'} +# this vignette requires: +library(opensensmapr) +library(jsonlite) +library(readr) +``` + +## Using the opensensmapr Caching Feature +All data retrieval functions of `opensensmapr` have a built in caching feature, +which serializes an API response to disk. +Subsequent identical requests will then return the serialized data instead of making +another request. + +To use this feature, just add a path to a directory to the `cache` parameter: +```{r cache} +b = osem_boxes(grouptag = 'ifgi', cache = tempdir()) + +# the next identical request will hit the cache only! +b = osem_boxes(grouptag = 'ifgi', cache = tempdir()) + +# requests without the cache parameter will still be performed normally +b = osem_boxes(grouptag = 'ifgi') +``` + +Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL: +```{r cachelisting} +list.files(tempdir(), pattern = 'osemcache\\..*\\.rds') +``` + +You can maintain multiple caches simultaneously which allows to only store data related to a script in the same directory: +```{r cache_custom} +cacheDir = getwd() # current working directory +b = osem_boxes(grouptag = 'ifgi', cache = cacheDir) + +# the next identical request will hit the cache only! +b = osem_boxes(grouptag = 'ifgi', cache = cacheDir) +``` + +To get fresh results again, just call `osem_clear_cache()` for the respective cache: +```{r clearcache, results='hide'} +osem_clear_cache() # clears default cache +osem_clear_cache(getwd()) # clears a custom cache +``` + +## Custom (De-) Serialization +If you want to roll your own serialization method to support custom data formats, +here's how: + +```{r data, results='hide'} +# first get our example data: +measurements = osem_measurements('Windgeschwindigkeit') +``` + +If you are paranoid and worry about `.rds` files not being decodable anymore +in the (distant) future, you could serialize to a plain text format such as JSON. +This of course comes at the cost of storage space and performance. +```{r serialize_json} +# serializing senseBoxes to JSON, and loading from file again: +write(jsonlite::serializeJSON(measurements), 'measurements.json') +measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json')) +class(measurements_from_file) +``` + +This method also persists the R object metadata (classes, attributes). +If you were to use a serialization method that can't persist object metadata, you +could re-apply it with the following functions: + +```{r serialize_attrs} +# note the toJSON call instead of serializeJSON +write(jsonlite::toJSON(measurements), 'measurements_bad.json') +measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json') +class(measurements_without_attrs) + +measurements_with_attrs = osem_as_measurements(measurements_without_attrs) +class(measurements_with_attrs) +``` +The same goes for boxes via `osem_as_sensebox()`. + +```{r cleanup, include=FALSE} +file.remove('measurements.json', 'measurements_bad.json') +``` diff --git a/inst/doc/osem-serialization.html b/inst/doc/osem-serialization.html new file mode 100644 index 0000000..e9bb8f3 --- /dev/null +++ b/inst/doc/osem-serialization.html @@ -0,0 +1,444 @@ + + + + + + + + + + + + + + + + +Caching openSenseMap Data for Reproducibility + + + + + + + + + + + + + + + + + + + + + + + + + + +

Caching openSenseMap Data for +Reproducibility

+

Norwin Roosen

+

2023-02-23

+ + + +

It may be useful to download data from openSenseMap only once. For +reproducible results, the data should be saved to disk, and reloaded at +a later point.

+

This avoids..

+ +

This vignette shows how to use this built in +opensensmapr feature, and how to do it yourself in case you +want to save to other data formats.

+
# this vignette requires:
+library(opensensmapr)
+library(jsonlite)
+library(readr)
+
+

Using the opensensmapr Caching Feature

+

All data retrieval functions of opensensmapr have a +built in caching feature, which serializes an API response to disk. +Subsequent identical requests will then return the serialized data +instead of making another request.

+

To use this feature, just add a path to a directory to the +cache parameter:

+
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
+
+# the next identical request will hit the cache only!
+b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
+
+# requests without the cache parameter will still be performed normally
+b = osem_boxes(grouptag = 'ifgi')
+

Looking at the cache directory we can see one file for each request, +which is identified through a hash of the request URL:

+
list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
+
## [1] "osemcache.17db5c57fc6fca4d836fa2cf30345ce8767cd61a.rds"
+

You can maintain multiple caches simultaneously which allows to only +store data related to a script in the same directory:

+
cacheDir = getwd() # current working directory
+b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
+
+# the next identical request will hit the cache only!
+b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
+

To get fresh results again, just call osem_clear_cache() +for the respective cache:

+
osem_clear_cache()        # clears default cache
+osem_clear_cache(getwd()) # clears a custom cache
+
+
+

Custom (De-) Serialization

+

If you want to roll your own serialization method to support custom +data formats, here’s how:

+
# first get our example data:
+measurements = osem_measurements('Windgeschwindigkeit')
+

If you are paranoid and worry about .rds files not being +decodable anymore in the (distant) future, you could serialize to a +plain text format such as JSON. This of course comes at the cost of +storage space and performance.

+
# serializing senseBoxes to JSON, and loading from file again:
+write(jsonlite::serializeJSON(measurements), 'measurements.json')
+measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
+class(measurements_from_file)
+
## [1] "osem_measurements" "tbl_df"            "tbl"              
+## [4] "data.frame"
+

This method also persists the R object metadata (classes, +attributes). If you were to use a serialization method that can’t +persist object metadata, you could re-apply it with the following +functions:

+
# note the toJSON call instead of serializeJSON
+write(jsonlite::toJSON(measurements), 'measurements_bad.json')
+measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
+class(measurements_without_attrs)
+
## [1] "data.frame"
+
measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
+class(measurements_with_attrs)
+
## [1] "osem_measurements" "tbl_df"            "tbl"              
+## [4] "data.frame"
+

The same goes for boxes via osem_as_sensebox().

+
+ + + + + + + + + + +