From 38008b1e6cc0a003e63efba8b37b90dced7a341a Mon Sep 17 00:00:00 2001 From: Norwin Roosen Date: Sat, 26 May 2018 12:52:02 +0200 Subject: [PATCH] add/update vignette builds --- inst/doc/osem-history.R | 133 ++++++++ inst/doc/osem-history.Rmd | 243 ++++++++++++++ inst/doc/osem-history.html | 501 +++++++++++++++++++++++++++++ inst/doc/osem-intro.Rmd | 6 +- inst/doc/osem-intro.html | 535 +++++++++---------------------- inst/doc/osem-serialization.Rmd | 4 +- inst/doc/osem-serialization.html | 510 +++++++++-------------------- vignettes/osem-history.Rmd | 6 +- vignettes/osem-intro.Rmd | 6 +- vignettes/osem-serialization.Rmd | 4 +- 10 files changed, 1202 insertions(+), 746 deletions(-) create mode 100644 inst/doc/osem-history.R create mode 100644 inst/doc/osem-history.Rmd create mode 100644 inst/doc/osem-history.html diff --git a/inst/doc/osem-history.R b/inst/doc/osem-history.R new file mode 100644 index 0000000..09048eb --- /dev/null +++ b/inst/doc/osem-history.R @@ -0,0 +1,133 @@ +## ----setup, results='hide', message=FALSE, warning=FALSE----------------- +# required packages: +library(opensensmapr) # data download +library(dplyr) # data wrangling +library(ggplot2) # plotting +library(lubridate) # date arithmetic +library(zoo) # rollmean() + +## ----download------------------------------------------------------------ +# if you want to see results for a specific subset of boxes, +# just specify a filter such as grouptag='ifgi' here +boxes = osem_boxes() + +## ----exposure_counts, message=FALSE-------------------------------------- +exposure_counts = boxes %>% + group_by(exposure) %>% + mutate(count = row_number(createdAt)) + +exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') +ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) + + geom_line() + + scale_colour_manual(values = exposure_colors) + + xlab('Registration Date') + ylab('senseBox count') + +## 
----exposure_summary---------------------------------------------------- +exposure_counts %>% + summarise( + oldest = min(createdAt), + newest = max(createdAt), + count = max(count) + ) %>% + arrange(desc(count)) + +## ----grouptag_counts, message=FALSE-------------------------------------- +grouptag_counts = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 8 or more members (elementwise & so that filter() gets one TRUE/FALSE per row; scalar && errors on vectors in R >= 4.3) + filter(length(grouptag) >= 8 & !is.na(grouptag)) %>% + mutate(count = row_number(createdAt)) + +# helper for sorting the grouptags by boxcount +sortLvls = function(oldFactor, ascending = TRUE) { + lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() + factor(oldFactor, levels = lvls) +} +grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) + +ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) + + geom_line(aes(group = grouptag)) + + xlab('Registration Date') + ylab('senseBox count') + +## ----grouptag_summary---------------------------------------------------- +grouptag_counts %>% + summarise( + oldest = min(createdAt), + newest = max(createdAt), + count = max(count) + ) %>% + arrange(desc(count)) + +## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'---- +bins = 'week' +mvavg_bins = 6 + +growth = boxes %>% + mutate(week = cut(as.Date(createdAt), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'registered') + +## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'---- +inactive = boxes %>% + # remove boxes that were updated in the last two days, + # b/c any box becomes inactive at some point by definition of updatedAt + filter(updatedAt < now() - days(2)) %>% + mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'inactive') + +## ----growthrate, warning=FALSE, message=FALSE, results='hide'------------ +boxes_by_date = bind_rows(growth, 
inactive) %>% group_by(event) + +ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + + xlab('Time') + ylab(paste('rate per ', bins)) + + scale_x_date(date_breaks="years", date_labels="%Y") + + scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + + geom_point(aes(y = count), size = 0.5) + + # moving average, make first and last value NA (to ensure identical length of vectors) + geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) + +## ----exposure_duration, message=FALSE------------------------------------ +duration = boxes %>% + group_by(exposure) %>% + filter(!is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = exposure, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +## ----grouptag_duration, message=FALSE------------------------------------ +duration = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 8 or more members, and drop boxes without updatedAt (elementwise & so every row is tested, not just the first row of each group) + filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = grouptag, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +duration %>% + summarize( + duration_avg = round(mean(duration)), + duration_min = round(min(duration)), + duration_max = round(max(duration)), + oldest_box = round(max(difftime(now(), createdAt, units='days'))) + ) %>% + arrange(desc(duration_avg)) + +## ----year_duration, message=FALSE---------------------------------------- +# NOTE: boxes older than 2016 missing due to missing updatedAt in database +duration = boxes %>% + mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>% + group_by(year) %>% + filter(!is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + + geom_boxplot() + + 
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') + diff --git a/inst/doc/osem-history.Rmd b/inst/doc/osem-history.Rmd new file mode 100644 index 0000000..ff9f3d3 --- /dev/null +++ b/inst/doc/osem-history.Rmd @@ -0,0 +1,243 @@ +--- +title: "Visualising the History of openSenseMap.org" +author: "Norwin Roosen" +date: '`r Sys.Date()`' +output: + rmarkdown::html_vignette: + df_print: kable + fig_height: 5 + fig_width: 7 + toc: yes + html_document: + code_folding: hide + df_print: kable + theme: lumen + toc: yes + toc_float: yes +vignette: > + %\VignetteIndexEntry{Visualising the History of openSenseMap.org} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +> This vignette serves as an example on data wrangling & visualization with +`opensensmapr`, `dplyr` and `ggplot2`. + +```{r setup, results='hide', message=FALSE, warning=FALSE} +# required packages: +library(opensensmapr) # data download +library(dplyr) # data wrangling +library(ggplot2) # plotting +library(lubridate) # date arithmetic +library(zoo) # rollmean() +``` + +openSenseMap.org has grown quite a bit in the last years; it would be interesting +to see how we got to the current `r osem_counts()$boxes` sensor stations, +split up by various attributes of the boxes. + +While `opensensmapr` provides extensive methods of filtering boxes by attributes +on the server, we do the filtering within R to save time and gain flexibility. +So the first step is to retrieve *all the boxes*: + +```{r download} +# if you want to see results for a specific subset of boxes, +# just specify a filter such as grouptag='ifgi' here +boxes = osem_boxes() +``` + +# Plot count of boxes by time {.tabset} +By looking at the `createdAt` attribute of each box we know the exact time a box +was registered. +With this approach we have no information about boxes that were deleted in the +meantime, but that's okay for now. 
+ +## ...and exposure +```{r exposure_counts, message=FALSE} +exposure_counts = boxes %>% + group_by(exposure) %>% + mutate(count = row_number(createdAt)) + +exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') +ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) + + geom_line() + + scale_colour_manual(values = exposure_colors) + + xlab('Registration Date') + ylab('senseBox count') +``` + +Outdoor boxes are growing *fast*! +We can also see the introduction of `mobile` sensor "stations" in 2017. While +mobile boxes are still few, we can expect a quick rise in 2018 once the new +[senseBox MCU with GPS support is released](https://sensebox.de/blog/2018-03-06-senseBox_MCU). + +Let's have a quick summary: +```{r exposure_summary} +exposure_counts %>% + summarise( + oldest = min(createdAt), + newest = max(createdAt), + count = max(count) + ) %>% + arrange(desc(count)) +``` + +## ...and grouptag +We can try to find out where the increases in growth came from, by analysing the +box count by grouptag. + +Caveats: Only a small subset of boxes has a grouptag, and we should assume +that these groups are actually bigger. Also, we can see that grouptag naming is +inconsistent (`Luftdaten`, `luftdaten.info`, ...) 
+ +```{r grouptag_counts, message=FALSE} +grouptag_counts = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 8 or more members (elementwise & so that filter() gets one TRUE/FALSE per row; scalar && errors on vectors in R >= 4.3) + filter(length(grouptag) >= 8 & !is.na(grouptag)) %>% + mutate(count = row_number(createdAt)) + +# helper for sorting the grouptags by boxcount +sortLvls = function(oldFactor, ascending = TRUE) { + lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() + factor(oldFactor, levels = lvls) +} +grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) + +ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) + + geom_line(aes(group = grouptag)) + + xlab('Registration Date') + ylab('senseBox count') +``` + +```{r grouptag_summary} +grouptag_counts %>% + summarise( + oldest = min(createdAt), + newest = max(createdAt), + count = max(count) + ) %>% + arrange(desc(count)) +``` + +# Plot rate of growth and inactivity per week +First we group the boxes by `createdAt` into bins of one week: +```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'} +bins = 'week' +mvavg_bins = 6 + +growth = boxes %>% + mutate(week = cut(as.Date(createdAt), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'registered') +``` + +We can do the same for `updatedAt`, which informs us about the last change to +a box, including uploaded measurements. +This method of determining inactive boxes is fairly inaccurate and should be +considered an approximation, because we have no information about intermediate +inactive phases. +Also deleted boxes would probably have a big impact here. 
+```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'} +inactive = boxes %>% + # remove boxes that were updated in the last two days, + # b/c any box becomes inactive at some point by definition of updatedAt + filter(updatedAt < now() - days(2)) %>% + mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'inactive') +``` + +Now we can combine both datasets for plotting: +```{r growthrate, warning=FALSE, message=FALSE, results='hide'} +boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) + +ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + + xlab('Time') + ylab(paste('rate per ', bins)) + + scale_x_date(date_breaks="years", date_labels="%Y") + + scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + + geom_point(aes(y = count), size = 0.5) + + # moving average, make first and last value NA (to ensure identical length of vectors) + geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) +``` + +We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`. +This was enabled by an integration of openSenseMap.org into the firmware of the +air quality monitoring project [luftdaten.info](https://luftdaten.info). +The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues +of the senseBox hardware, but I have no data on the exact time frames to verify. 
+ +# Plot duration of boxes being active {.tabset} +While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity +of each box, and look at metrics by exposure and grouptag once more: + +## ...by exposure +```{r exposure_duration, message=FALSE} +duration = boxes %>% + group_by(exposure) %>% + filter(!is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = exposure, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') +``` + +The time of activity averages at only `r round(mean(duration$duration))` days, +though there are boxes with `r round(max(duration$duration))` days of activity, +spanning a large chunk of openSenseMap's existence. + +## ...by grouptag +```{r grouptag_duration, message=FALSE} +duration = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 8 or more members, and drop boxes without updatedAt (elementwise & so every row is tested, not just the first row of each group) + filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = grouptag, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +duration %>% + summarize( + duration_avg = round(mean(duration)), + duration_min = round(min(duration)), + duration_max = round(max(duration)), + oldest_box = round(max(difftime(now(), createdAt, units='days'))) + ) %>% + arrange(desc(duration_avg)) +``` + +The time of activity averages at only `r round(mean(duration$duration))` days, +though there are boxes with `r round(max(duration$duration))` days of activity, +spanning a large chunk of openSenseMap's existence. + +## ...by year of registration +This is less useful, as older boxes are active for a longer time by definition. +If you have an idea how to compensate for that, please send a [Pull Request][PR]! 
+ +```{r year_duration, message=FALSE} +# NOTE: boxes older than 2016 missing due to missing updatedAt in database +duration = boxes %>% + mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>% + group_by(year) %>% + filter(!is.na(updatedAt)) %>% + mutate(duration = difftime(updatedAt, createdAt, units='days')) + +ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') +``` + +# More Visualisations +Other visualisations come to mind, and are left as an exercise to the reader. +If you implemented some, feel free to add them to this vignette via a [Pull Request][PR]. + +* growth by phenomenon +* growth by location -> (interactive) map +* set inactive rate in relation to total box count +* filter timespans with big dips in growth rate, and extrapolate the amount of + senseBoxes that could be on the platform today, assuming there were no production issues ;) + +[PR]: https://github.com/noerw/opensensmapr/pulls diff --git a/inst/doc/osem-history.html b/inst/doc/osem-history.html new file mode 100644 index 0000000..1c72d00 --- /dev/null +++ b/inst/doc/osem-history.html @@ -0,0 +1,501 @@ + + + + + + + + + + + + + + + + +Visualising the History of openSenseMap.org + + + + + + + + + + + + + + + + + +

Visualising the History of openSenseMap.org

+

Norwin Roosen

+

2018-05-26

+ + +
+ +
+ +
+

This vignette serves as an example on data wrangling & visualization with opensensmapr, dplyr and ggplot2.

+
+
# required packages:
+library(opensensmapr) # data download
+library(dplyr)        # data wrangling
+library(ggplot2)      # plotting
+library(lubridate)    # date arithmetic
+library(zoo)          # rollmean()
+

openSenseMap.org has grown quite a bit in the last years; it would be interesting to see how we got to the current 1781 sensor stations, split up by various attributes of the boxes.

+

While opensensmapr provides extensive methods of filtering boxes by attributes on the server, we do the filtering within R to save time and gain flexibility. So the first step is to retrieve all the boxes:

+
# if you want to see results for a specific subset of boxes,
+# just specify a filter such as grouptag='ifgi' here
+boxes = osem_boxes()
+
+

Plot count of boxes by time

+

By looking at the createdAt attribute of each box we know the exact time a box was registered. With this approach we have no information about boxes that were deleted in the meantime, but that’s okay for now.

+
+

…and exposure

+
exposure_counts = boxes %>%
+  group_by(exposure) %>%
+  mutate(count = row_number(createdAt))
+
+exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
+ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
+  geom_line() +
+  scale_colour_manual(values = exposure_colors) +
+  xlab('Registration Date') + ylab('senseBox count')
+

+

Outdoor boxes are growing fast! We can also see the introduction of mobile sensor “stations” in 2017. While mobile boxes are still few, we can expect a quick rise in 2018 once the new senseBox MCU with GPS support is released.

+

Let’s have a quick summary:

+
exposure_counts %>%
+  summarise(
+    oldest = min(createdAt),
+    newest = max(createdAt),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
exposureoldestnewestcount
outdoor2015-02-18 16:53:412018-05-26 08:39:121416
indoor2015-02-08 17:36:402018-05-26 10:29:27290
mobile2017-05-24 08:16:362018-05-24 07:08:3255
unknown2014-05-28 15:36:142016-06-25 15:11:1120
+
+
+
+

…and grouptag

+

We can try to find out where the increases in growth came from, by analysing the box count by grouptag.

+

Caveats: Only a small subset of boxes has a grouptag, and we should assume that these groups are actually bigger. Also, we can see that grouptag naming is inconsistent (Luftdaten, luftdaten.info, …)

+
grouptag_counts = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 8 or more members
+  filter(length(grouptag) >= 8 && !is.na(grouptag)) %>%
+  mutate(count = row_number(createdAt))
+
+# helper for sorting the grouptags by boxcount
+sortLvls = function(oldFactor, ascending = TRUE) {
+  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
+  factor(oldFactor, levels = lvls)
+}
+grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
+
+ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
+  geom_line(aes(group = grouptag)) +
+  xlab('Registration Date') + ylab('senseBox count')
+

+
grouptag_counts %>%
+  summarise(
+    oldest = min(createdAt),
+    newest = max(createdAt),
+    count = max(count)
+  ) %>%
+  arrange(desc(count))
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
grouptagoldestnewestcount
Luftdaten2017-03-14 17:01:162018-05-21 02:20:50109
ifgi2016-06-17 08:04:542018-05-15 10:27:0235
MakeLight2015-02-18 16:53:412018-02-02 13:50:2115
Bad_Hersfeld2017-07-18 13:32:032018-03-22 09:10:0713
luftdaten.info2017-05-01 10:15:442018-05-17 11:47:2112
dwih-sp2016-08-09 08:06:022016-11-23 10:16:0411
Che Aria Tira?2018-03-11 10:50:422018-03-11 23:11:2010
Luftdaten.info2017-04-03 14:10:202018-04-16 16:31:2410
Feinstaub2017-04-08 06:38:252018-03-29 17:27:559
PGKN2018-04-08 07:01:572018-04-27 18:38:519
Raumanmeri2017-03-13 11:35:392017-04-27 05:36:209
Sofia2017-04-11 04:40:112018-03-15 13:26:569
IKG2017-03-21 19:02:112017-12-18 14:30:218
+
+
+
+
+

Plot rate of growth and inactivity per week

+

First we group the boxes by createdAt into bins of one week:

+
bins = 'week'
+mvavg_bins = 6
+
+growth = boxes %>%
+  mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'registered')
+

We can do the same for updatedAt, which informs us about the last change to a box, including uploaded measurements. This method of determining inactive boxes is fairly inaccurate and should be considered an approximation, because we have no information about intermediate inactive phases. Also deleted boxes would probably have a big impact here.

+
inactive = boxes %>%
+  # remove boxes that were updated in the last two days,
+  # b/c any box becomes inactive at some point by definition of updatedAt
+  filter(updatedAt < now() - days(2)) %>%
+  mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
+  group_by(week) %>%
+  summarize(count = length(week)) %>%
+  mutate(event = 'inactive')
+

Now we can combine both datasets for plotting:

+
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
+
+ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
+  xlab('Time') + ylab(paste('rate per ', bins)) +
+  scale_x_date(date_breaks="years", date_labels="%Y") +
+  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
+  geom_point(aes(y = count), size = 0.5) +
+  # moving average, make first and last value NA (to ensure identical length of vectors)
+  geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
+

+

We see a sudden rise in early 2017, which lines up with the fast growing grouptag Luftdaten. This was enabled by an integration of openSenseMap.org into the firmware of the air quality monitoring project luftdaten.info. The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues of the senseBox hardware, but I have no data on the exact time frames to verify.

+
+
+

Plot duration of boxes being active

+

While we are looking at createdAt and updatedAt, we can also extract the duration of activity of each box, and look at metrics by exposure and grouptag once more:

+
+

…by exposure

+
duration = boxes %>%
+  group_by(exposure) %>%
+  filter(!is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+
+ggplot(duration, aes(x = exposure, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+

+

The time of activity averages at only 152 days, though there are boxes with 759 days of activity, spanning a large chunk of openSenseMap’s existence.

+
+
+

…by grouptag

+
duration = boxes %>%
+  group_by(grouptag) %>%
+  # only include grouptags with 8 or more members
+  filter(length(grouptag) >= 8 && !is.na(grouptag) && !is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+  
+ggplot(duration, aes(x = grouptag, y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days')
+

+
duration %>%
+  summarize(
+    duration_avg = round(mean(duration)),
+    duration_min = round(min(duration)),
+    duration_max = round(max(duration)),
+    oldest_box = round(max(difftime(now(), createdAt, units='days')))
+  ) %>%
+  arrange(desc(duration_avg))
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
grouptagduration_avgduration_minduration_maxoldest_box
dwih-sp627 days549 days655 days655 days
Feinstaub219 days4 days413 days413 days
ifgi207 days0 days622 days708 days
Sofia200 days15 days410 days410 days
Bad_Hersfeld197 days65 days312 days312 days
Luftdaten187 days0 days424 days438 days
luftdaten.info183 days9 days360 days390 days
IKG163 days70 days260 days431 days
Luftdaten.info86 days5 days376 days418 days
Che Aria Tira?75 days71 days76 days76 days
Raumanmeri45 days7 days318 days439 days
PGKN35 days29 days48 days48 days
+
+

The time of activity averages at only 191 days, though there are boxes with 655 days of activity, spanning a large chunk of openSenseMap’s existence.

+
+
+

…by year of registration

+

This is less useful, as older boxes are active for a longer time by definition. If you have an idea how to compensate for that, please send a Pull Request!

+
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
+duration = boxes %>%
+  mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
+  group_by(year) %>%
+  filter(!is.na(updatedAt)) %>%
+  mutate(duration = difftime(updatedAt, createdAt, units='days'))
+
+ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
+  geom_boxplot() +
+  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
+

+
+
+
+

More Visualisations

+

Other visualisations come to mind, and are left as an exercise to the reader. If you implemented some, feel free to add them to this vignette via a Pull Request.

+ +
+ + + + + + + + diff --git a/inst/doc/osem-intro.Rmd b/inst/doc/osem-intro.Rmd index 9906138..7a2ad09 100644 --- a/inst/doc/osem-intro.Rmd +++ b/inst/doc/osem-intro.Rmd @@ -1,5 +1,5 @@ --- -title: "Analyzing environmental sensor data from openSenseMap.org in R" +title: "Exploring the openSenseMap Dataset" author: "Norwin Roosen" date: "`r Sys.Date()`" output: @@ -8,7 +8,7 @@ output: fig_width: 6 fig_height: 4 vignette: > - %\VignetteIndexEntry{Analyzing environmental sensor data from openSenseMap.org in R} + %\VignetteIndexEntry{Exploring the openSenseMap Dataset} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -17,8 +17,6 @@ vignette: > knitr::opts_chunk$set(echo = TRUE) ``` -## Analyzing environmental sensor data from openSenseMap.org in R - This package provides data ingestion functions for almost any data stored on the open data platform for environemental sensordata . Its main goals are to provide means for: diff --git a/inst/doc/osem-intro.html b/inst/doc/osem-intro.html index 32e4d1f..5835840 100644 --- a/inst/doc/osem-intro.html +++ b/inst/doc/osem-intro.html @@ -1,242 +1,103 @@ - - - - -Analyzing environmental sensor data from openSenseMap.org in R - - - - - + - - +Exploring the openSenseMap Dataset + - tr, img { - page-break-inside: avoid; - } - img { - max-width: 100% !important; - } - @page :left { - margin: 15mm 20mm 15mm 10mm; - } + - @page :right { - margin: 15mm 10mm 15mm 20mm; - } + - p, h2, h3 { - orphans: 3; widows: 3; - } + - h2, h3 { - page-break-after: avoid; - } -} - - +

Exploring the openSenseMap Dataset

+

Norwin Roosen

+

2018-05-26

- -

Analyzing environmental sensor data from openSenseMap.org in R

-

This package provides data ingestion functions for almost any data stored on the -open data platform for environemental sensordata https://opensensemap.org. -Its main goals are to provide means for:

+

This package provides data ingestion functions for almost any data stored on the open data platform for environmental sensor data https://opensensemap.org. Its main goals are to provide means for:

- +

Exploring the dataset

+

Before we look at actual observations, let's get a grasp of the openSenseMap datasets’ structure.

+
library(magrittr)
+library(opensensmapr)
 
-

Before we look at actual observations, lets get a grasp of the openSenseMap -datasets' structure.

- -
library(magrittr)
-library(opensensmapr)
-
-all_sensors = osem_boxes()
-
- -
summary(all_sensors)
-
- -
## boxes total: 1779
+all_sensors = osem_boxes()
+
summary(all_sensors)
+
## boxes total: 1781
 ## 
 ## boxes by exposure:
 ##  indoor  mobile outdoor unknown 
-##     288      55    1416      20 
+##     290      55    1416      20 
 ## 
 ## boxes by model:
 ##                   custom             homeEthernet    homeEthernetFeinstaub 
-##                      335                       92                       49 
+##                      336                       92                       49 
 ##                 homeWifi        homeWifiFeinstaub        luftdaten_pms1003 
-##                      192                      144                        1 
+##                      193                      144                        1 
 ## luftdaten_pms1003_bme280 luftdaten_pms5003_bme280 luftdaten_pms7003_bme280 
 ##                        1                        5                        2 
 ##         luftdaten_sds011  luftdaten_sds011_bme280  luftdaten_sds011_bmp180 
@@ -246,52 +107,34 @@ all_sensors = osem_boxes()
 ## 
 ## $last_measurement_within
 ##    1h    1d   30d  365d never 
-##   921   960  1089  1427   235 
+##   929   954  1091  1428   235 
 ## 
 ## oldest box: 2014-05-28 15:36:14 (CALIMERO)
-## newest box: 2018-05-24 20:29:50 (Stadthalle)
+## newest box: 2018-05-26 10:29:27 (UOS_DDI)
 ## 
 ## sensors per box:
 ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-##   1.000   4.000   4.000   4.601   5.000  33.000
-
- -

This gives a good overview already: As of writing this, there are more than 700 -sensor stations, of which ~50% are currently running. Most of them are placed -outdoors and have around 5 sensors each. -The oldest station is from May 2014, while the latest station was registered a -couple of minutes ago.

- -

Another feature of interest is the spatial distribution of the boxes: plot() -can help us out here. This function requires a bunch of optional dependencies though.

- -
if (!require('maps'))     install.packages('maps')
-if (!require('maptools')) install.packages('maptools')
-if (!require('rgeos'))    install.packages('rgeos')
-
-plot(all_sensors)
-
- -

plot of chunk unnamed-chunk-3

- +## 1.0 4.0 4.0 4.6 5.0 33.0
+

This gives a good overview already: As of writing this, there are more than 1700 sensor stations, of which ~50% are currently running. Most of them are placed outdoors and have around 5 sensors each. The oldest station is from May 2014, while the latest station was registered a couple of minutes ago.

+

Another feature of interest is the spatial distribution of the boxes: plot() can help us out here. This function requires a bunch of optional dependencies though.

+
if (!require('maps'))     install.packages('maps')
+if (!require('maptools')) install.packages('maptools')
+if (!require('rgeos'))    install.packages('rgeos')
+
+plot(all_sensors)
+

It seems we have to reduce our area of interest to Germany.

- -

But what do these sensor stations actually measure? Lets find out. -osem_phenomena() gives us a named list of of the counts of each observed -phenomenon for the given set of sensor stations:

- -
phenoms = osem_phenomena(all_sensors)
-str(phenoms)
-
- -
## List of 432
-##  $ Temperatur                                      : int 1607
-##  $ rel. Luftfeuchte                                : int 1421
+

But what do these sensor stations actually measure? Let's find out. osem_phenomena() gives us a named list of the counts of each observed phenomenon for the given set of sensor stations:

+
phenoms = osem_phenomena(all_sensors)
+str(phenoms)
+
## List of 433
+##  $ Temperatur                                      : int 1608
+##  $ rel. Luftfeuchte                                : int 1422
 ##  $ PM10                                            : int 1200
 ##  $ PM2.5                                           : int 1198
-##  $ Luftdruck                                       : int 824
-##  $ Beleuchtungsstärke                              : int 480
-##  $ UV-Intensität                                   : int 471
+##  $ Luftdruck                                       : int 825
+##  $ Beleuchtungsstärke                              : int 481
+##  $ UV-Intensität                                   : int 472
 ##  $ Luftfeuchtigkeit                                : int 84
 ##  $ Temperature                                     : int 49
 ##  $ Humidity                                        : int 42
@@ -309,10 +152,10 @@ str(phenoms)
 ##  $ Feinstaub PM10                                  : int 10
 ##  $ Feinstaub PM2.5                                 : int 9
 ##  $ Kosteus                                         : int 8
+##  $ Temperatur DHT22                                : int 8
 ##  $ Valonmäärä                                      : int 8
 ##  $ temperature                                     : int 8
 ##  $ PM01                                            : int 7
-##  $ Temperatur DHT22                                : int 7
 ##  $ UV-säteily                                      : int 7
 ##  $ Niederschlag                                    : int 6
 ##  $ UV-Strahlung                                    : int 6
@@ -330,6 +173,7 @@ str(phenoms)
 ##  $ UV-Säteily                                      : int 4
 ##  $ lautstärke                                      : int 4
 ##  $ rel. Luftfeuchte 1                              : int 4
+##  $ rel. Luftfeuchte DHT22                          : int 4
 ##  $ relative Luftfeuchtigkeit                       : int 4
 ##  $ Air pressure                                    : int 3
 ##  $ Batterie                                        : int 3
@@ -352,7 +196,6 @@ str(phenoms)
 ##  $ Valoisuus                                       : int 3
 ##  $ Wind Gust                                       : int 3
 ##  $ pressure                                        : int 3
-##  $ rel. Luftfeuchte DHT22                          : int 3
 ##  $ 1                                               : int 2
 ##  $ 10                                              : int 2
 ##  $ 2                                               : int 2
@@ -384,22 +227,14 @@ str(phenoms)
 ##  $ Sound                                           : int 2
 ##  $ Temperatur (DHT22)                              : int 2
 ##  $ Temperatur BMP180                               : int 2
-##   [list output truncated]
-
- -

Thats quite some noise there, with many phenomena being measured by a single -sensor only, or many duplicated phenomena due to slightly different spellings. -We should clean that up, but for now let's just filter out the noise and find -those phenomena with high sensor numbers:

- -
phenoms[phenoms > 20]
-
- +## [list output truncated]
+

That’s quite some noise there, with many phenomena being measured by a single sensor only, or many duplicated phenomena due to slightly different spellings. We should clean that up, but for now let’s just filter out the noise and find those phenomena with high sensor numbers:

+
phenoms[phenoms > 20]
## $Temperatur
-## [1] 1607
+## [1] 1608
 ## 
 ## $`rel. Luftfeuchte`
-## [1] 1421
+## [1] 1422
 ## 
 ## $PM10
 ## [1] 1200
@@ -408,13 +243,13 @@ those phenomena with high sensor numbers:

## [1] 1198 ## ## $Luftdruck -## [1] 824 +## [1] 825 ## ## $Beleuchtungsstärke -## [1] 480 +## [1] 481 ## ## $`UV-Intensität` -## [1] 471 +## [1] 472 ## ## $Luftfeuchtigkeit ## [1] 84 @@ -429,159 +264,99 @@ those phenomena with high sensor numbers:

## [1] 25 ## ## $Lautstärke -## [1] 21 -
- -

Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more -interesting to analyze though. -We should check how many sensor stations provide useful data: We want only those -boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting -measurements:

- -
pm25_sensors = osem_boxes(
-  exposure = 'outdoor',
-  date = Sys.time(), # ±4 hours
-  phenomenon = 'PM2.5'
-)
-
- -
summary(pm25_sensors)
-
- -
## boxes total: 788
+## [1] 21
+

Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more interesting to analyze though. We should check how many sensor stations provide useful data: We want only those boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting measurements:

+
pm25_sensors = osem_boxes(
+  exposure = 'outdoor',
+  date = Sys.time(), # ±4 hours
+  phenomenon = 'PM2.5'
+)
+
summary(pm25_sensors)
+
## boxes total: 791
 ## 
 ## boxes by exposure:
 ## outdoor 
-##     788 
+##     791 
 ## 
 ## boxes by model:
 ##                   custom    homeEthernetFeinstaub                 homeWifi 
-##                       28                       37                        6 
+##                       29                       37                        6 
 ##        homeWifiFeinstaub luftdaten_pms1003_bme280 luftdaten_pms5003_bme280 
-##                       57                        1                        2 
+##                       57                        1                        1 
 ## luftdaten_pms7003_bme280         luftdaten_sds011  luftdaten_sds011_bme280 
-##                        2                       33                      135 
+##                        2                       32                      137 
 ##  luftdaten_sds011_bmp180   luftdaten_sds011_dht11   luftdaten_sds011_dht22 
-##                       14                       31                      442 
+##                       14                       32                      443 
 ## 
 ## $last_measurement_within
 ##    1h    1d   30d  365d never 
-##   764   777   780   785     3 
+##   771   780   784   789     2 
 ## 
 ## oldest box: 2016-06-02 12:09:47 (BalkonBox Mindener Str.)
 ## newest box: 2018-05-24 20:29:50 (Stadthalle)
 ## 
 ## sensors per box:
 ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-##   2.000   4.000   4.000   4.615   5.000  12.000
-
- -
plot(pm25_sensors)
-
- -

plot of chunk unnamed-chunk-7

- +## 2.000 4.000 4.000 4.617 5.000 12.000 +
plot(pm25_sensors)
+

Thats still more than 200 measuring stations, we can work with that.

- +
+

Analyzing sensor data

- -

Having analyzed the available data sources, let's finally get some measurements. -We could call osem_measurements(pm25_sensors) now, however we are focussing on -a restricted area of interest, the city of Berlin. -Luckily we can get the measurements filtered by a bounding box:

- -
library(sf)
-
- -
## Linking to GEOS 3.6.1, GDAL 2.1.4, proj.4 4.9.3
-
- -
library(units)
-library(lubridate)
-
- -
## 
-## Attaching package: 'lubridate'
-
- -
## The following object is masked from 'package:base':
-## 
-##     date
-
- -
library(dplyr)
-
- +

Having analyzed the available data sources, let’s finally get some measurements. We could call osem_measurements(pm25_sensors) now, however we are focussing on a restricted area of interest, the city of Berlin. Luckily we can get the measurements filtered by a bounding box:

+
library(sf)
+
## Linking to GEOS 3.5.1, GDAL 2.2.2, proj.4 4.9.2
+
library(units)
## 
-## Attaching package: 'dplyr'
-
- -
## The following objects are masked from 'package:lubridate':
-## 
-##     intersect, setdiff, union
-
- -
## The following objects are masked from 'package:rgeos':
+## Attaching package: 'units'
+
## The following object is masked from 'package:base':
 ## 
-##     intersect, setdiff, union
-
- -
## The following objects are masked from 'package:stats':
-## 
-##     filter, lag
-
- -
## The following objects are masked from 'package:base':
-## 
-##     intersect, setdiff, setequal, union
-
- -
# construct a bounding box: 12 kilometers around Berlin
-berlin = st_point(c(13.4034, 52.5120)) %>%
-  st_sfc(crs = 4326) %>%
-  st_transform(3857) %>% # allow setting a buffer in meters
-  st_buffer(set_units(12, km)) %>%
-  st_transform(4326) %>% # the opensensemap expects WGS 84
-  st_bbox()
-
- -
pm25 = osem_measurements(
+##     %*%
+
library(lubridate)
+library(dplyr)
+
+# construct a bounding box: 12 kilometers around Berlin
+berlin = st_point(c(13.4034, 52.5120)) %>%
+  st_sfc(crs = 4326) %>%
+  st_transform(3857) %>% # allow setting a buffer in meters
+  st_buffer(set_units(12, km)) %>%
+  st_transform(4326) %>% # the opensensemap expects WGS 84
+  st_bbox()
+
pm25 = osem_measurements(
   berlin,
-  phenomenon = 'PM2.5',
-  from = now() - days(20), # defaults to 2 days
-  to = now()
+  phenomenon = 'PM2.5',
+  from = now() - days(20), # defaults to 2 days
+  to = now()
 )
 
-plot(pm25)
-
- -

plot of chunk unnamed-chunk-9

- -

Now we can get started with actual spatiotemporal data analysis. -First, lets mask the seemingly uncalibrated sensors:

- -
outliers = filter(pm25, value > 100)$sensorId
-bad_sensors = outliers[, drop = T] %>% levels()
-
-pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
-
+plot(pm25)
+

+

Now we can get started with actual spatiotemporal data analysis. First, let’s mask the seemingly uncalibrated sensors:

+
outliers = filter(pm25, value > 100)$sensorId
+bad_sensors = outliers[, drop = T] %>% levels()
 
+pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)

Then plot the measuring locations, flagging the outliers:

- -
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
-
- -

plot of chunk unnamed-chunk-11

- +
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
+

Removing these sensors yields a nicer time series plot:

+
pm25 %>% filter(invalid == FALSE) %>% plot()
+

+

Further analysis: comparison with LANUV data TODO

+
-
pm25 %>% filter(invalid == FALSE) %>% plot()
-
-

plot of chunk unnamed-chunk-12

-

Further analysis: comparison with LANUV data TODO

+ + - diff --git a/inst/doc/osem-serialization.Rmd b/inst/doc/osem-serialization.Rmd index f7812af..9a8d676 100644 --- a/inst/doc/osem-serialization.Rmd +++ b/inst/doc/osem-serialization.Rmd @@ -1,10 +1,10 @@ --- -title: "opensensmapr reproducibility: Loading openSenseMap Data from Files" +title: "Caching openSenseMap Data for Reproducibility" author: "Norwin Roosen" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{opensensmapr reproducibility: Loading openSenseMap Data from Files} + %\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/inst/doc/osem-serialization.html b/inst/doc/osem-serialization.html index 448348f..6b8ddf0 100644 --- a/inst/doc/osem-serialization.html +++ b/inst/doc/osem-serialization.html @@ -1,379 +1,186 @@ - - - - -Using openSensMapr Caching Feature - - - - - - - - - - - + -pre, img { - max-width: 100%; -} -pre { - overflow-x: auto; -} -pre code { - display: block; padding: 0.5em; -} -code { - font-size: 92%; - border: 1px solid #ccc; -} -code[class] { - background-color: #F8F8F8; -} + -table, td, th { - border: none; -} + -blockquote { - color:#666666; - margin:0; - padding-left: 1em; - border-left: 0.5em #EEE solid; -} + -hr { - height: 0px; - border-bottom: none; - border-top-width: thin; - border-top-style: dotted; - border-top-color: #999999; -} -@media print { - * { - background: transparent !important; - color: black !important; - filter:none !important; - -ms-filter: none !important; - } - - body { - font-size:12pt; - max-width:100%; - } - - a, a:visited { - text-decoration: underline; - } - - hr { - visibility: hidden; - page-break-before: always; - } - - pre, blockquote { - padding-right: 1em; - page-break-inside: avoid; - } - - tr, img { - page-break-inside: avoid; - } - - img { - max-width: 100% !important; - } - - @page :left { - margin: 15mm 20mm 15mm 10mm; - } - - @page :right { - margin: 15mm 
10mm 15mm 20mm; - } - - p, h2, h3 { - orphans: 3; widows: 3; - } - - h2, h3 { - page-break-after: avoid; - } -} - +

Caching openSenseMap Data for Reproducibility

+

Norwin Roosen

+

2018-05-26

- - -

It may be useful to download data from openSenseMap only once. -For reproducible results, the data could be saved to disk, and reloaded at a -later point.

+

It may be useful to download data from openSenseMap only once. For reproducible results, the data could be saved to disk, and reloaded at a later point.

This avoids..

- - -

This vignette shows how to use this built in opensensmapr feature, and -how to do it yourself, if you want to store to other data formats.

- +

This vignette shows how to use this built-in opensensmapr feature, and how to do it yourself if you want to store the data in other formats.

+

Using openSensMapr Caching Feature

- -

All data retrieval functions of opensensmapr have a built in caching feature, -which serializes an API response to disk. -Subsequent identical requests will then return the serialized data instead of making -another request. -To do so, each request is given a unique ID based on its parameters.

- +

All data retrieval functions of opensensmapr have a built-in caching feature, which serializes an API response to disk. Subsequent identical requests will then return the serialized data instead of making another request. To do so, each request is given a unique ID based on its parameters.

To use this feature, just add a path to a directory to the cache parameter:

- -
b = osem_boxes(cache = tempdir())
-list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
-
- -
## [1] "osemcache.c54710f66b662e29dd86b089962b0f598e47eddb.rds"
-
- -
# the next identical request will hit the cache only!
-b = osem_boxes(cache = tempdir())
-
-# requests without the cache parameter will still be performed normally
-b = osem_boxes()
-
- -

You can maintain multiple caches simultaneously which allows to store only -serialized data related to a script in its directory:

- -
cacheDir = getwd() # current working directory
-b = osem_boxes(cache = cacheDir)
-
-# the next identical request will hit the cache only!
-b = osem_boxes(cache = cacheDir)
-
- +
b = osem_boxes(cache = tempdir())
+list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
+
## [1] "osemcache.c54710f66b662e29dd86b089962b0f598e47eddb.rds"
+
# the next identical request will hit the cache only!
+b = osem_boxes(cache = tempdir())
+
+# requests without the cache parameter will still be performed normally
+b = osem_boxes()
+

You can maintain multiple caches simultaneously, which allows you to store only the serialized data related to a script in its directory:

+
cacheDir = getwd() # current working directory
+b = osem_boxes(cache = cacheDir)
+
+# the next identical request will hit the cache only!
+b = osem_boxes(cache = cacheDir)

To get fresh results again, just call osem_clear_cache() for the respective cache:

- -
osem_clear_cache() # clears default cache
-
- -
## [1] TRUE
-
- -
osem_clear_cache(getwd()) # clears a custom cache
-
- -
## [1] TRUE
-
- +
osem_clear_cache() # clears default cache
+
## [1] TRUE
+
osem_clear_cache(getwd()) # clears a custom cache
+
## [1] TRUE
+
+

Custom (De-) Serialization

- -

If you want to roll your own serialization method to support custom data formats, -here's how:

- -
# this section requires:
-library(opensensmapr)
-library(jsonlite)
-library(readr)
-
-# first get our example data:
-boxes = osem_boxes(grouptag = 'ifgi')
-measurements = osem_measurements(boxes, phenomenon = 'PM10')
-
- -

If you are paranoid and worry about .rds files not being decodable anymore -in the (distant) future, you could serialize to a plain text format such as JSON. -This of course comes at the cost of storage space and performance.

- -
# serializing senseBoxes to JSON, and loading from file again:
-write(jsonlite::serializeJSON(measurements), 'boxes.json')
-boxes_from_file = jsonlite::unserializeJSON(readr::read_file('boxes.json'))
-
- -

Both methods also persist the R object metadata (classes, attributes). -If you were to use a serialization method that can't persist object metadata, you -could re-apply it with the following functions:

- -
# note the toJSON call
-write(jsonlite::toJSON(measurements), 'boxes_bad.json')
-boxes_without_attrs = jsonlite::fromJSON('boxes_bad.json')
-
-boxes_with_attrs = osem_as_sensebox(boxes_without_attrs)
-class(boxes_with_attrs)
-
- -
## [1] "sensebox"   "data.frame"
-
- +

If you want to roll your own serialization method to support custom data formats, here’s how:

+
# this section requires:
+library(opensensmapr)
+library(jsonlite)
+library(readr)
+
+# first get our example data:
+boxes = osem_boxes(grouptag = 'ifgi')
+measurements = osem_measurements(boxes, phenomenon = 'PM10')
+

If you are paranoid and worry about .rds files not being decodable anymore in the (distant) future, you could serialize to a plain text format such as JSON. This of course comes at the cost of storage space and performance.

+
# serializing senseBoxes to JSON, and loading from file again:
+write(jsonlite::serializeJSON(measurements), 'boxes.json')
+boxes_from_file = jsonlite::unserializeJSON(readr::read_file('boxes.json'))
+

Both methods also persist the R object metadata (classes, attributes). If you were to use a serialization method that can’t persist object metadata, you could re-apply it with the following functions:

+
# note the toJSON call
+write(jsonlite::toJSON(measurements), 'boxes_bad.json')
+boxes_without_attrs = jsonlite::fromJSON('boxes_bad.json')
+
+boxes_with_attrs = osem_as_sensebox(boxes_without_attrs)
+class(boxes_with_attrs)
+
## [1] "sensebox"   "data.frame"

The same goes for measurements via osem_as_measurements().

- +
+

Workflow for reproducible code

- -

For truly reproducible code you want it to work and return the same results – -no matter if you run it the first time or a consecutive time, and without making -changes to it.

- -

Therefore we need a wrapper around the save-to-file & load-from-file logic. -The following examples show a way to do just that, and where inspired by -this reproducible analysis by Daniel Nuest.

- -
# offline logic
-osem_offline = function (func, file, format='rds', ...) {
-  # deserialize if file exists, otherwise download and serialize
-  if (file.exists(file)) {
-    if (format == 'json')
-      jsonlite::unserializeJSON(readr::read_file(file))
+

For truly reproducible code you want it to work and return the same results – no matter if you run it the first time or a consecutive time, and without making changes to it.

+

Therefore we need a wrapper around the save-to-file & load-from-file logic. The following examples show a way to do just that, and were inspired by this reproducible analysis by Daniel Nuest.

+
# offline logic
+osem_offline = function (func, file, format='rds', ...) {
+  # deserialize if file exists, otherwise download and serialize
+  if (file.exists(file)) {
+    if (format == 'json')
+      jsonlite::unserializeJSON(readr::read_file(file))
     else
-      readRDS(file)
+      readRDS(file)
   } else {
-    data = func(...)
-    if (format == 'json')
-      write(jsonlite::serializeJSON(data), file = file)
+    data = func(...)
+    if (format == 'json')
+      write(jsonlite::serializeJSON(data), file = file)
     else
-      saveRDS(data, file)
+      saveRDS(data, file)
     data
   }
 }
 
-# wrappers for each download function
-osem_measurements_offline = function (file, ...) {
-  osem_offline(opensensmapr::osem_measurements, file, ...)
-}
-osem_boxes_offline = function (file, ...) {
-  osem_offline(opensensmapr::osem_boxes, file, ...)
-}
-osem_box_offline = function (file, ...) {
-  osem_offline(opensensmapr::osem_box, file, ...)
-}
-osem_counts_offline = function (file, ...) {
-  osem_offline(opensensmapr::osem_counts, file, ...)
-}
-
- -

Thats it! Now let's try it out:

- -
# first run; will download and save to disk
-b1 = osem_boxes_offline('mobileboxes.rds', exposure='mobile')
-
-# consecutive runs; will read from disk
-b2 = osem_boxes_offline('mobileboxes.rds', exposure='mobile')
-class(b1) == class(b2)
-
- -
## [1] TRUE TRUE
-
- -
# we can even omit the arguments now (though thats not really the point here)
-b3 = osem_boxes_offline('mobileboxes.rds')
-nrow(b1) == nrow(b3)
-
- -
## [1] TRUE
-
- -
# verify that the custom sensebox methods are still working
-summary(b2)
-
- +# wrappers for each download function +osem_measurements_offline = function (file, ...) { + osem_offline(opensensmapr::osem_measurements, file, ...) +} +osem_boxes_offline = function (file, ...) { + osem_offline(opensensmapr::osem_boxes, file, ...) +} +osem_box_offline = function (file, ...) { + osem_offline(opensensmapr::osem_box, file, ...) +} +osem_counts_offline = function (file, ...) { + osem_offline(opensensmapr::osem_counts, file, ...) +}
+

That’s it! Now let’s try it out:

+
# first run; will download and save to disk
+b1 = osem_boxes_offline('mobileboxes.rds', exposure='mobile')
+
+# consecutive runs; will read from disk
+b2 = osem_boxes_offline('mobileboxes.rds', exposure='mobile')
+class(b1) == class(b2)
+
## [1] TRUE TRUE
+
# we can even omit the arguments now (though thats not really the point here)
+b3 = osem_boxes_offline('mobileboxes.rds')
+nrow(b1) == nrow(b3)
+
## [1] TRUE
+
# verify that the custom sensebox methods are still working
+summary(b2)
## boxes total: 55
 ## 
 ## boxes by exposure:
@@ -390,35 +197,34 @@ summary(b2)
 ## 
 ## $last_measurement_within
 ##    1h    1d   30d  365d never 
-##    16    18    24    43    12 
+##    16    16    24    43    12 
 ## 
 ## oldest box: 2017-05-24 08:16:36 (Feinstaub Hauptstrasse Steampunk-Design)
 ## newest box: 2018-05-24 07:08:32 (Josi Test)
 ## 
 ## sensors per box:
 ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-##   1.000   4.000   4.000   4.618   5.000  22.000
-
- -
plot(b3)
-
- -

plot of chunk test

- +## 1.000 4.000 4.000 4.618 5.000 22.000 +
plot(b3)
+

To re-download the data, just clear the files that were created in the process:

+
file.remove('mobileboxes.rds', 'boxes_bad.json', 'boxes.json', 'measurements.rds')
+
## Warning in file.remove("mobileboxes.rds", "boxes_bad.json", "boxes.json", :
+## cannot remove file 'measurements.rds', reason 'No such file or directory'
+

A possible extension to this scheme comes to mind: Omit the specification of a filename, and assign a unique ID to the request instead. For example, one could calculate the SHA-1 hash of the parameters, and use it as filename.

+ -
file.remove('mobileboxes.rds', 'boxes_bad.json', 'boxes.json', 'measurements.rds')
-
-
## Warning in file.remove("mobileboxes.rds", "boxes_bad.json", "boxes.json", :
-## cannot remove file 'measurements.rds', reason 'No such file or directory'
-
-

A possible extension to this scheme comes to mind: Omit the specification of a -filename, and assign a unique ID to the request instead. -For example, one could calculate the SHA-1 hash of the parameters, and use it -as filename.

+ + - diff --git a/vignettes/osem-history.Rmd b/vignettes/osem-history.Rmd index ca7af72..ff9f3d3 100644 --- a/vignettes/osem-history.Rmd +++ b/vignettes/osem-history.Rmd @@ -14,8 +14,10 @@ output: theme: lumen toc: yes toc_float: yes -vignette: | - %\VignetteIndexEntry{Visualising the History of openSenseMap.org} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} +vignette: > + %\VignetteIndexEntry{Visualising the History of openSenseMap.org} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} --- > This vignette serves as an example on data wrangling & visualization with diff --git a/vignettes/osem-intro.Rmd b/vignettes/osem-intro.Rmd index 9906138..7a2ad09 100644 --- a/vignettes/osem-intro.Rmd +++ b/vignettes/osem-intro.Rmd @@ -1,5 +1,5 @@ --- -title: "Analyzing environmental sensor data from openSenseMap.org in R" +title: "Exploring the openSenseMap Dataset" author: "Norwin Roosen" date: "`r Sys.Date()`" output: @@ -8,7 +8,7 @@ output: fig_width: 6 fig_height: 4 vignette: > - %\VignetteIndexEntry{Analyzing environmental sensor data from openSenseMap.org in R} + %\VignetteIndexEntry{Exploring the openSenseMap Dataset} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -17,8 +17,6 @@ vignette: > knitr::opts_chunk$set(echo = TRUE) ``` -## Analyzing environmental sensor data from openSenseMap.org in R - This package provides data ingestion functions for almost any data stored on the open data platform for environemental sensordata . 
Its main goals are to provide means for: diff --git a/vignettes/osem-serialization.Rmd b/vignettes/osem-serialization.Rmd index f7812af..9a8d676 100644 --- a/vignettes/osem-serialization.Rmd +++ b/vignettes/osem-serialization.Rmd @@ -1,10 +1,10 @@ --- -title: "opensensmapr reproducibility: Loading openSenseMap Data from Files" +title: "Caching openSenseMap Data for Reproducibility" author: "Norwin Roosen" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{opensensmapr reproducibility: Loading openSenseMap Data from Files} + %\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---