From 62667ef139f7f7eb5c5f45e6929e77ed379c9686 Mon Sep 17 00:00:00 2001 From: jan Date: Thu, 23 Feb 2023 14:07:21 +0100 Subject: [PATCH] remove inst/doc files --- inst/doc/osem-history.R | 133 -- inst/doc/osem-history.Rmd | 243 --- inst/doc/osem-history.html | 501 ------ inst/doc/osem-history_revised.Rmd | 302 ---- inst/doc/osem-history_revised.html | 2493 ---------------------------- inst/doc/osem-intro.R | 73 - inst/doc/osem-intro.Rmd | 151 -- inst/doc/osem-intro.html | 362 ---- inst/doc/osem-serialization.R | 51 - inst/doc/osem-serialization.Rmd | 106 -- inst/doc/osem-serialization.html | 152 -- 11 files changed, 4567 deletions(-) delete mode 100644 inst/doc/osem-history.R delete mode 100644 inst/doc/osem-history.Rmd delete mode 100644 inst/doc/osem-history.html delete mode 100644 inst/doc/osem-history_revised.Rmd delete mode 100644 inst/doc/osem-history_revised.html delete mode 100644 inst/doc/osem-intro.R delete mode 100644 inst/doc/osem-intro.Rmd delete mode 100644 inst/doc/osem-intro.html delete mode 100644 inst/doc/osem-serialization.R delete mode 100644 inst/doc/osem-serialization.Rmd delete mode 100644 inst/doc/osem-serialization.html diff --git a/inst/doc/osem-history.R b/inst/doc/osem-history.R deleted file mode 100644 index 09048eb..0000000 --- a/inst/doc/osem-history.R +++ /dev/null @@ -1,133 +0,0 @@ -## ----setup, results='hide', message=FALSE, warning=FALSE----------------- -# required packages: -library(opensensmapr) # data download -library(dplyr) # data wrangling -library(ggplot2) # plotting -library(lubridate) # date arithmetic -library(zoo) # rollmean() - -## ----download------------------------------------------------------------ -# if you want to see results for a specific subset of boxes, -# just specify a filter such as grouptag='ifgi' here -boxes = osem_boxes() - -## ----exposure_counts, message=FALSE-------------------------------------- -exposure_counts = boxes %>% - group_by(exposure) %>% - mutate(count = row_number(createdAt)) - -exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') -ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) + - geom_line() + - scale_colour_manual(values = exposure_colors) + - xlab('Registration Date') + ylab('senseBox count') - -## ----exposure_summary---------------------------------------------------- -exposure_counts %>% - summarise( - oldest = min(createdAt), - newest = max(createdAt), - count = max(count) - ) %>% - arrange(desc(count)) - -## ----grouptag_counts, message=FALSE-------------------------------------- -grouptag_counts = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 8 or more members - filter(length(grouptag) >= 8 && !is.na(grouptag)) %>% - mutate(count = row_number(createdAt)) - -# helper for sorting the grouptags by boxcount -sortLvls = function(oldFactor, ascending = TRUE) { - lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() - factor(oldFactor, levels = lvls) -} -grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) - -ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) + - geom_line(aes(group = grouptag)) + - xlab('Registration Date') + ylab('senseBox count') - -## ----grouptag_summary---------------------------------------------------- -grouptag_counts %>% - summarise( - oldest = min(createdAt), - newest = max(createdAt), - count = max(count) - ) %>% - arrange(desc(count)) - -## ----growthrate_registered, warning=FALSE, message=FALSE, results='hide'---- -bins = 'week' -mvavg_bins = 6 - -growth = boxes %>% - mutate(week = cut(as.Date(createdAt), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'registered') - -## ----growthrate_inactive, warning=FALSE, message=FALSE, results='hide'---- -inactive = boxes %>% - # remove boxes that were updated in the last two days, - # b/c any box becomes inactive at some point by definition of updatedAt - filter(updatedAt < now() - days(2)) %>% - mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'inactive') - -## ----growthrate, warning=FALSE, message=FALSE, results='hide'------------ -boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) - -ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + - xlab('Time') + ylab(paste('rate per ', bins)) + - scale_x_date(date_breaks="years", date_labels="%Y") + - scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + - geom_point(aes(y = count), size = 0.5) + - # moving average, make first and last value NA (to ensure identical length of vectors) - geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) - -## ----exposure_duration, message=FALSE------------------------------------ -duration = boxes %>% - group_by(exposure) %>% - filter(!is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = exposure, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') - -## ----grouptag_duration, message=FALSE------------------------------------ -duration = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 8 or more members - filter(length(grouptag) >= 8 && !is.na(grouptag) && !is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = grouptag, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') - -duration %>% - summarize( - duration_avg = round(mean(duration)), - duration_min = round(min(duration)), - duration_max = round(max(duration)), - oldest_box = round(max(difftime(now(), createdAt, units='days'))) - ) %>% - arrange(desc(duration_avg)) - -## ----year_duration, message=FALSE---------------------------------------- -# NOTE: boxes older than 2016 missing due to missing updatedAt in database -duration = boxes %>% - mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>% - group_by(year) %>% - filter(!is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') - diff --git a/inst/doc/osem-history.Rmd b/inst/doc/osem-history.Rmd deleted file mode 100644 index 12aff6a..0000000 --- a/inst/doc/osem-history.Rmd +++ /dev/null @@ -1,243 +0,0 @@ ---- -title: "Visualising the History of openSenseMap.org" -author: "Norwin Roosen" -date: '`r Sys.Date()`' -output: - rmarkdown::html_vignette: - df_print: kable - fig_height: 5 - fig_width: 7 - toc: yes - html_document: - code_folding: hide - df_print: kable - theme: lumen - toc: yes - toc_float: yes -vignette: > - %\VignetteIndexEntry{Visualising the History of openSenseMap.org} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -> This vignette serves as an example on data wrangling & visualization with -`opensensmapr`, `dplyr` and `ggplot2`. - -```{r setup, results='hide', message=FALSE, warning=FALSE} -# required packages: -library(opensensmapr) # data download -library(dplyr) # data wrangling -library(ggplot2) # plotting -library(lubridate) # date arithmetic -library(zoo) # rollmean() -``` - -openSenseMap.org has grown quite a bit in the last years; it would be interesting -to see how we got to the current `r osem_counts()$boxes` sensor stations, -split up by various attributes of the boxes. - -While `opensensmapr` provides extensive methods of filtering boxes by attributes -on the server, we do the filtering within R to save time and gain flexibility. -So the first step is to retrieve *all the boxes*: - -```{r download} -# if you want to see results for a specific subset of boxes, -# just specify a filter such as grouptag='ifgi' here -boxes = osem_boxes() -``` - -# Plot count of boxes by time {.tabset} -By looking at the `createdAt` attribute of each box we know the exact time a box -was registered. -With this approach we have no information about boxes that were deleted in the -meantime, but that's okay for now. - -## ...and exposure -```{r exposure_counts, message=FALSE} -exposure_counts = boxes %>% - group_by(exposure) %>% - mutate(count = row_number(createdAt)) - -exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') -ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) + - geom_line() + - scale_colour_manual(values = exposure_colors) + - xlab('Registration Date') + ylab('senseBox count') -``` - -Outdoor boxes are growing *fast*! -We can also see the introduction of `mobile` sensor "stations" in 2017. While -mobile boxes are still few, we can expect a quick rise in 2018 once the new -[senseBox MCU with GPS support is released](https://sensebox.de/blog/2018-03-06-senseBox_MCU). - -Let's have a quick summary: -```{r exposure_summary} -exposure_counts %>% - summarise( - oldest = min(createdAt), - newest = max(createdAt), - count = max(count) - ) %>% - arrange(desc(count)) -``` - -## ...and grouptag -We can try to find out where the increases in growth came from, by analysing the -box count by grouptag. - -Caveats: Only a small subset of boxes has a grouptag, and we should assume -that these groups are actually bigger. Also, we can see that grouptag naming is -inconsistent (`Luftdaten`, `luftdaten.info`, ...) - -```{r grouptag_counts, message=FALSE} -grouptag_counts = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 8 or more members - filter(length(grouptag) >= 8 && !is.na(grouptag)) %>% - mutate(count = row_number(createdAt)) - -# helper for sorting the grouptags by boxcount -sortLvls = function(oldFactor, ascending = TRUE) { - lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() - factor(oldFactor, levels = lvls) -} -grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) - -ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) + - geom_line(aes(group = grouptag)) + - xlab('Registration Date') + ylab('senseBox count') -``` - -```{r grouptag_summary} -grouptag_counts %>% - summarise( - oldest = min(createdAt), - newest = max(createdAt), - count = max(count) - ) %>% - arrange(desc(count)) -``` - -# Plot rate of growth and inactivity per week -First we group the boxes by `createdAt` into bins of one week: -```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'} -bins = 'week' -mvavg_bins = 6 - -growth = boxes %>% - mutate(week = cut(as.Date(createdAt), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'registered') -``` - -We can do the same for `updatedAt`, which informs us about the last change to -a box, including uploaded measurements. -This method of determining inactive boxes is fairly inaccurate and should be -considered an approximation, because we have no information about intermediate -inactive phases. -Also deleted boxes would probably have a big impact here. -```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'} -inactive = boxes %>% - # remove boxes that were updated in the last two days, - # b/c any box becomes inactive at some point by definition of updatedAt - filter(updatedAt < now() - days(2)) %>% - mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'inactive') -``` - -Now we can combine both datasets for plotting: -```{r growthrate, warning=FALSE, message=FALSE, results='hide'} -boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) - -ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + - xlab('Time') + ylab(paste('rate per ', bins)) + - scale_x_date(date_breaks="years", date_labels="%Y") + - scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + - geom_point(aes(y = count), size = 0.5) + - # moving average, make first and last value NA (to ensure identical length of vectors) - geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) -``` - -We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`. -This was enabled by an integration of openSenseMap.org into the firmware of the -air quality monitoring project [luftdaten.info](https://luftdaten.info). -The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues -of the senseBox hardware, but I have no data on the exact time frames to verify. - -# Plot duration of boxes being active {.tabset} -While we are looking at `createdAt` and `updatedAt`, we can also extract the duration of activity -of each box, and look at metrics by exposure and grouptag once more: - -## ...by exposure -```{r exposure_duration, message=FALSE} -duration = boxes %>% - group_by(exposure) %>% - filter(!is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = exposure, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') -``` - -The time of activity averages at only `r round(mean(duration$duration))` days, -though there are boxes with `r round(max(duration$duration))` days of activity, -spanning a large chunk of openSenseMap's existence. - -## ...by grouptag -```{r grouptag_duration, message=FALSE} -duration = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 8 or more members - filter(length(grouptag) >= 8 && !is.na(grouptag) && !is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = grouptag, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') - -duration %>% - summarize( - duration_avg = round(mean(duration)), - duration_min = round(min(duration)), - duration_max = round(max(duration)), - oldest_box = round(max(difftime(now(), createdAt, units='days'))) - ) %>% - arrange(desc(duration_avg)) -``` - -The time of activity averages at only `r round(mean(duration$duration))` days, -though there are boxes with `r round(max(duration$duration))` days of activity, -spanning a large chunk of openSenseMap's existence. - -## ...by year of registration -This is less useful, as older boxes are active for a longer time by definition. -If you have an idea how to compensate for that, please send a [Pull Request][PR]! - -```{r year_duration, message=FALSE} -# NOTE: boxes older than 2016 missing due to missing updatedAt in database -duration = boxes %>% - mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>% - group_by(year) %>% - filter(!is.na(updatedAt)) %>% - mutate(duration = difftime(updatedAt, createdAt, units='days')) - -ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') -``` - -# More Visualisations -Other visualisations come to mind, and are left as an exercise to the reader. -If you implemented some, feel free to add them to this vignette via a [Pull Request][PR]. - -* growth by phenomenon -* growth by location -> (interactive) map -* set inactive rate in relation to total box count -* filter timespans with big dips in growth rate, and extrapolate the amount of - senseBoxes that could be on the platform today, assuming there were no production issues ;) - -[PR]: https://github.com/sensebox/opensensmapr/pulls diff --git a/inst/doc/osem-history.html b/inst/doc/osem-history.html deleted file mode 100644 index af507d9..0000000 --- a/inst/doc/osem-history.html +++ /dev/null @@ -1,501 +0,0 @@ - - - - - - - - - - - - - - - - -Visualising the History of openSenseMap.org - - - - - - - - - - - - - - - - - -

Visualising the History of openSenseMap.org

-

Norwin Roosen

-

2018-05-26

- - -
- -
- -
-

This vignette serves as an example on data wrangling & visualization with opensensmapr, dplyr and ggplot2.

-
-
# required packages:
-library(opensensmapr) # data download
-library(dplyr)        # data wrangling
-library(ggplot2)      # plotting
-library(lubridate)    # date arithmetic
-library(zoo)          # rollmean()
-

openSenseMap.org has grown quite a bit in the last years; it would be interesting to see how we got to the current 1781 sensor stations, split up by various attributes of the boxes.

-

While opensensmapr provides extensive methods of filtering boxes by attributes on the server, we do the filtering within R to save time and gain flexibility. So the first step is to retrieve all the boxes:

-
# if you want to see results for a specific subset of boxes,
-# just specify a filter such as grouptag='ifgi' here
-boxes = osem_boxes()
-
-

Plot count of boxes by time

-

By looking at the createdAt attribute of each box we know the exact time a box was registered. With this approach we have no information about boxes that were deleted in the meantime, but that’s okay for now.

-
-

…and exposure

-
exposure_counts = boxes %>%
-  group_by(exposure) %>%
-  mutate(count = row_number(createdAt))
-
-exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
-ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
-  geom_line() +
-  scale_colour_manual(values = exposure_colors) +
-  xlab('Registration Date') + ylab('senseBox count')
-

-

Outdoor boxes are growing fast! We can also see the introduction of mobile sensor “stations” in 2017. While mobile boxes are still few, we can expect a quick rise in 2018 once the new senseBox MCU with GPS support is released.

-

Let’s have a quick summary:

-
exposure_counts %>%
-  summarise(
-    oldest = min(createdAt),
-    newest = max(createdAt),
-    count = max(count)
-  ) %>%
-  arrange(desc(count))
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
exposureoldestnewestcount
outdoor2015-02-18 16:53:412018-05-26 08:39:121416
indoor2015-02-08 17:36:402018-05-26 10:29:27290
mobile2017-05-24 08:16:362018-05-24 07:08:3255
unknown2014-05-28 15:36:142016-06-25 15:11:1120
-
-
-
-

…and grouptag

-

We can try to find out where the increases in growth came from, by analysing the box count by grouptag.

-

Caveats: Only a small subset of boxes has a grouptag, and we should assume that these groups are actually bigger. Also, we can see that grouptag naming is inconsistent (Luftdaten, luftdaten.info, …)

-
grouptag_counts = boxes %>%
-  group_by(grouptag) %>%
-  # only include grouptags with 8 or more members
-  filter(length(grouptag) >= 8 && !is.na(grouptag)) %>%
-  mutate(count = row_number(createdAt))
-
-# helper for sorting the grouptags by boxcount
-sortLvls = function(oldFactor, ascending = TRUE) {
-  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
-  factor(oldFactor, levels = lvls)
-}
-grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
-
-ggplot(grouptag_counts, aes(x = createdAt, y = count, colour = grouptag)) +
-  geom_line(aes(group = grouptag)) +
-  xlab('Registration Date') + ylab('senseBox count')
-

-
grouptag_counts %>%
-  summarise(
-    oldest = min(createdAt),
-    newest = max(createdAt),
-    count = max(count)
-  ) %>%
-  arrange(desc(count))
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
grouptagoldestnewestcount
Luftdaten2017-03-14 17:01:162018-05-21 02:20:50109
ifgi2016-06-17 08:04:542018-05-15 10:27:0235
MakeLight2015-02-18 16:53:412018-02-02 13:50:2115
Bad_Hersfeld2017-07-18 13:32:032018-03-22 09:10:0713
luftdaten.info2017-05-01 10:15:442018-05-17 11:47:2112
dwih-sp2016-08-09 08:06:022016-11-23 10:16:0411
Che Aria Tira?2018-03-11 10:50:422018-03-11 23:11:2010
Luftdaten.info2017-04-03 14:10:202018-04-16 16:31:2410
Feinstaub2017-04-08 06:38:252018-03-29 17:27:559
PGKN2018-04-08 07:01:572018-04-27 18:38:519
Raumanmeri2017-03-13 11:35:392017-04-27 05:36:209
Sofia2017-04-11 04:40:112018-03-15 13:26:569
IKG2017-03-21 19:02:112017-12-18 14:30:218
-
-
-
-
-

Plot rate of growth and inactivity per week

-

First we group the boxes by createdAt into bins of one week:

-
bins = 'week'
-mvavg_bins = 6
-
-growth = boxes %>%
-  mutate(week = cut(as.Date(createdAt), breaks = bins)) %>%
-  group_by(week) %>%
-  summarize(count = length(week)) %>%
-  mutate(event = 'registered')
-

We can do the same for updatedAt, which informs us about the last change to a box, including uploaded measurements. This method of determining inactive boxes is fairly inaccurate and should be considered an approximation, because we have no information about intermediate inactive phases. Also deleted boxes would probably have a big impact here.

-
inactive = boxes %>%
-  # remove boxes that were updated in the last two days,
-  # b/c any box becomes inactive at some point by definition of updatedAt
-  filter(updatedAt < now() - days(2)) %>%
-  mutate(week = cut(as.Date(updatedAt), breaks = bins)) %>%
-  group_by(week) %>%
-  summarize(count = length(week)) %>%
-  mutate(event = 'inactive')
-

Now we can combine both datasets for plotting:

-
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
-
-ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
-  xlab('Time') + ylab(paste('rate per ', bins)) +
-  scale_x_date(date_breaks="years", date_labels="%Y") +
-  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
-  geom_point(aes(y = count), size = 0.5) +
-  # moving average, make first and last value NA (to ensure identical length of vectors)
-  geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
-

-

We see a sudden rise in early 2017, which lines up with the fast growing grouptag Luftdaten. This was enabled by an integration of openSenseMap.org into the firmware of the air quality monitoring project luftdaten.info. The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues of the senseBox hardware, but I have no data on the exact time frames to verify.

-
-
-

Plot duration of boxes being active

-

While we are looking at createdAt and updatedAt, we can also extract the duration of activity of each box, and look at metrics by exposure and grouptag once more:

-
-

…by exposure

-
duration = boxes %>%
-  group_by(exposure) %>%
-  filter(!is.na(updatedAt)) %>%
-  mutate(duration = difftime(updatedAt, createdAt, units='days'))
-
-ggplot(duration, aes(x = exposure, y = duration)) +
-  geom_boxplot() +
-  coord_flip() + ylab('Duration active in Days')
-

-

The time of activity averages at only 152 days, though there are boxes with 759 days of activity, spanning a large chunk of openSenseMap’s existence.

-
-
-

…by grouptag

-
duration = boxes %>%
-  group_by(grouptag) %>%
-  # only include grouptags with 8 or more members
-  filter(length(grouptag) >= 8 && !is.na(grouptag) && !is.na(updatedAt)) %>%
-  mutate(duration = difftime(updatedAt, createdAt, units='days'))
-  
-ggplot(duration, aes(x = grouptag, y = duration)) +
-  geom_boxplot() +
-  coord_flip() + ylab('Duration active in Days')
-

-
duration %>%
-  summarize(
-    duration_avg = round(mean(duration)),
-    duration_min = round(min(duration)),
-    duration_max = round(max(duration)),
-    oldest_box = round(max(difftime(now(), createdAt, units='days')))
-  ) %>%
-  arrange(desc(duration_avg))
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
grouptagduration_avgduration_minduration_maxoldest_box
dwih-sp627 days549 days655 days655 days
Feinstaub219 days4 days413 days413 days
ifgi207 days0 days622 days708 days
Sofia200 days15 days410 days410 days
Bad_Hersfeld197 days65 days312 days312 days
Luftdaten187 days0 days424 days438 days
luftdaten.info183 days9 days360 days390 days
IKG163 days70 days260 days431 days
Luftdaten.info86 days5 days376 days418 days
Che Aria Tira?75 days71 days76 days76 days
Raumanmeri45 days7 days318 days439 days
PGKN35 days29 days48 days48 days
-
-

The time of activity averages at only 191 days, though there are boxes with 655 days of activity, spanning a large chunk of openSenseMap’s existence.

-
-
-

…by year of registration

-

This is less useful, as older boxes are active for a longer time by definition. If you have an idea how to compensate for that, please send a Pull Request!

-
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
-duration = boxes %>%
-  mutate(year = cut(as.Date(createdAt), breaks = 'year')) %>%
-  group_by(year) %>%
-  filter(!is.na(updatedAt)) %>%
-  mutate(duration = difftime(updatedAt, createdAt, units='days'))
-
-ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
-  geom_boxplot() +
-  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
-

-
-
-
-

More Visualisations

-

Other visualisations come to mind, and are left as an exercise to the reader. If you implemented some, feel free to add them to this vignette via a Pull Request.

- -
- - - - - - - - diff --git a/inst/doc/osem-history_revised.Rmd b/inst/doc/osem-history_revised.Rmd deleted file mode 100644 index 1820d6c..0000000 --- a/inst/doc/osem-history_revised.Rmd +++ /dev/null @@ -1,302 +0,0 @@ ---- -title: "Visualising the Develpment of openSenseMap.org in 2022" -author: "Jan Stenkamp" -date: '`r Sys.Date()`' -output: - html_document: - code_folding: hide - df_print: kable - theme: lumen - toc: yes - toc_float: yes - rmarkdown::html_vignette: - df_print: kable - fig_height: 5 - fig_width: 7 - toc: yes -vignette: > - %\VignetteIndexEntry{Visualising the History of openSenseMap.org} - %\VignetteEncoding{UTF-8} - %\VignetteEngine{knitr::rmarkdown} ---- - -> This vignette serves as an example on data wrangling & visualization with -`opensensmapr`, `dplyr` and `ggplot2`. - -```{r setup, results='hide', message=FALSE, warning=FALSE} -# required packages: -# library(opensensmapr) # data download -library(devtools) -load_all(".") -library(dplyr) # data wrangling -library(ggplot2) # plotting -library(lubridate) # date arithmetic -library(zoo) # rollmean() -``` - -openSenseMap.org has grown quite a bit in the last years; it would be interesting -to see how we got to the current `r osem_counts()$boxes` sensor stations, -split up by various attributes of the boxes. - -While `opensensmapr` provides extensive methods of filtering boxes by attributes -on the server, we do the filtering within R to save time and gain flexibility. - - -So the first step is to retrieve *all the boxes*. - -```{r download, results='hide', message=FALSE, warning=FALSE} -# if you want to see results for a specific subset of boxes, -# just specify a filter such as grouptag='ifgi' here -boxes_all = osem_boxes() -boxes = boxes_all -``` -# Introduction -In the following we just want to have a look at the boxes created in 2022, so we filter for them. - -```{r} -boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp <="2022-12-31") -summary(boxes) -> summary.data.frame -``` - - - - - - - -Another feature of interest is the spatial distribution of the boxes: `plot()` -can help us out here. This function requires a bunch of optional dependencies though. - -```{r message=F, warning=F} -if (!require('maps')) install.packages('maps') -if (!require('maptools')) install.packages('maptools') -if (!require('rgeos')) install.packages('rgeos') - -plot(boxes) -``` - -But what do these sensor stations actually measure? Lets find out. -`osem_phenomena()` gives us a named list of of the counts of each observed -phenomenon for the given set of sensor stations: - -```{r} -phenoms = osem_phenomena(boxes) -str(phenoms) -``` - -Thats quite some noise there, with many phenomena being measured by a single -sensor only, or many duplicated phenomena due to slightly different spellings. -We should clean that up, but for now let's just filter out the noise and find -those phenomena with high sensor numbers: - -```{r} -phenoms[phenoms > 50] -``` - - -# Plot count of boxes by time {.tabset} -By looking at the `createdAt` attribute of each box we know the exact time a box -was registered. Because of some database migration issues the `createdAt` values are mostly wrong (~80% of boxes created 2022-03-30), so we are using the `timestamp` attribute of the `currentlocation` which should in most cases correspond to the creation date. - -With this approach we have no information about boxes that were deleted in the -meantime, but that's okay for now. - -## ...and exposure -```{r exposure_counts, message=FALSE} -exposure_counts = boxes %>% - group_by(exposure) %>% - mutate(count = row_number(locationtimestamp)) - -exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') -ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) + - geom_line() + - scale_colour_manual(values = exposure_colors) + - xlab('Registration Date') + ylab('senseBox count') -``` - -Outdoor boxes are growing *fast*! -We can also see the introduction of `mobile` sensor "stations" in 2017. - -Let's have a quick summary: -```{r exposure_summary} -exposure_counts %>% - summarise( - oldest = min(locationtimestamp), - newest = max(locationtimestamp), - count = max(count) - ) %>% - arrange(desc(count)) -``` - -## ...and grouptag -We can try to find out where the increases in growth came from, by analysing the -box count by grouptag. - -Caveats: Only a small subset of boxes has a grouptag, and we should assume -that these groups are actually bigger. Also, we can see that grouptag naming is -inconsistent (`Luftdaten`, `luftdaten.info`, ...) - -```{r grouptag_counts, message=FALSE} -grouptag_counts = boxes %>% - group_by(grouptag) %>% - # only include grouptags with 15 or more members - filter(length(grouptag) >= 15 && !is.na(grouptag) && grouptag != '') %>% - mutate(count = row_number(locationtimestamp)) - -# helper for sorting the grouptags by boxcount -sortLvls = function(oldFactor, ascending = TRUE) { - lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() - factor(oldFactor, levels = lvls) -} -grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) - -ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) + - geom_line(aes(group = grouptag)) + - xlab('Registration Date') + ylab('senseBox count') -``` - -```{r grouptag_summary} -grouptag_counts %>% - summarise( - oldest = min(locationtimestamp), - newest = max(locationtimestamp), - count = max(count) - ) %>% - arrange(desc(count)) -``` - -# Plot rate of growth and inactivity per week -First we group the boxes by `locationtimestamp` into bins of one week: -```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'} -bins = 'week' -mvavg_bins = 6 - -growth = boxes %>% - mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'registered') -``` - -We can do the same for `updatedAt`, which informs us about the last change to -a box, including uploaded measurements. As a lot of boxes were "updated" by the database -migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement` -attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes -boxes which were created but never made a measurement. - -This method of determining inactive boxes is fairly inaccurate and should be -considered an approximation, because we have no information about intermediate -inactive phases. -Also deleted boxes would probably have a big impact here. -```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'} -inactive = boxes %>% - # remove boxes that were updated in the last two days, - # b/c any box becomes inactive at some point by definition of updatedAt - filter(lastMeasurement < now() - days(2)) %>% - mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>% - filter(as.Date(week) > as.Date("2021-12-31")) %>% - group_by(week) %>% - summarize(count = length(week)) %>% - mutate(event = 'inactive') -``` - -Now we can combine both datasets for plotting: -```{r growthrate, warning=FALSE, message=FALSE, results='hide'} -boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) - -ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + - xlab('Time') + ylab(paste('rate per ', bins)) + - scale_x_date(date_breaks="years", date_labels="%Y") + - scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + - geom_point(aes(y = count), size = 0.5) + - # moving average, make first and last value NA (to ensure identical length of vectors) - geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) -``` - -And see in which weeks the most boxes become (in)active: -```{r table_mostregistrations} -boxes_by_date %>% - filter(count > 50) %>% - arrange(desc(count)) -``` - -# Plot duration of boxes being active {.tabset} -While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity -of each box, and look at metrics by exposure and grouptag once more: - -## ...by exposure -```{r exposure_duration, message=FALSE} -durations = boxes %>% - group_by(exposure) %>% - filter(!is.na(lastMeasurement)) %>% - mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% - filter(duration >= 0) - -ggplot(durations, aes(x = exposure, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') -``` - -The time of activity averages at only `r round(mean(durations$duration))` days, -though there are boxes with `r round(max(durations$duration))` days of activity, -spanning a large chunk of openSenseMap's existence. - -## ...by grouptag -```{r grouptag_duration, message=FALSE} -durations = boxes %>% - filter(!is.na(lastMeasurement)) %>% - group_by(grouptag) %>% - # only include grouptags with 20 or more members - filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>% - mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% - filter(duration >= 0) - -ggplot(durations, aes(x = grouptag, y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') - -durations %>% - summarize( - duration_avg = round(mean(duration)), - duration_min = round(min(duration)), - duration_max = round(max(duration)), - oldest_box = round(max(difftime(now(), locationtimestamp, units='days'))) - ) %>% - arrange(desc(duration_avg)) -``` - -The time of activity averages at only `r round(mean(durations$duration))` days, -though there are boxes with `r round(max(durations$duration))` days of activity, -spanning a large chunk of openSenseMap's existence. - -## ...by year of registration -This is less useful, as older boxes are active for a longer time by definition. -If you have an idea how to compensate for that, please send a [Pull Request][PR]! - -```{r year_duration, message=FALSE} -# NOTE: boxes older than 2016 missing due to missing updatedAt in database -duration = boxes %>% - mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>% - group_by(year) %>% - filter(!is.na(lastMeasurement)) %>% - mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% - filter(duration >= 0) - -ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + - geom_boxplot() + - coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') -``` - -# More Visualisations -Other visualisations come to mind, and are left as an exercise to the reader. -If you implemented some, feel free to add them to this vignette via a [Pull Request][PR]. - -* growth by phenomenon -* growth by location -> (interactive) map -* set inactive rate in relation to total box count -* filter timespans with big dips in growth rate, and extrapolate the amount of - senseBoxes that could be on the platform today, assuming there were no production issues ;) - -[PR]: https://github.com/sensebox/opensensmapr/pulls - - diff --git a/inst/doc/osem-history_revised.html b/inst/doc/osem-history_revised.html deleted file mode 100644 index 2fd728f..0000000 --- a/inst/doc/osem-history_revised.html +++ /dev/null @@ -1,2493 +0,0 @@ - - - - - - - - - - - - - - - -Visualising the Develpment of openSenseMap.org in 2022 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
-
-
-
- -
- - - - - - - -
-

This vignette serves as an example on data wrangling & -visualization with opensensmapr, dplyr and -ggplot2.

-
-
# required packages:
-#library(opensensmapr) # data download
-library(devtools)
-load_all(".")
-library(dplyr)        # data wrangling
-library(ggplot2)      # plotting
-library(lubridate)    # date arithmetic
-library(zoo)          # rollmean()
-

openSenseMap.org has grown quite a bit in the last years; it would be -interesting to see how we got to the current 11307 sensor stations, -split up by various attributes of the boxes.

-

While opensensmapr provides extensive methods of -filtering boxes by attributes on the server, we do the filtering within -R to save time and gain flexibility.

-

So the first step is to retrieve all the boxes.

-
# if you want to see results for a specific subset of boxes,
-# just specify a filter such as grouptag='ifgi' here
-boxes_all = osem_boxes()
-boxes = boxes_all
-
-

Introduction

-

In the following we just want to have a look at the boxes created in -2022, so we filter for them.

-
boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp <="2022-12-31")
-summary(boxes) -> summary.data.frame
-
## boxes total: 2132
-## 
-## boxes by exposure:
-##  indoor  mobile outdoor unknown 
-##     532     201    1398       1 
-## 
-## boxes by model:
-##                   custom          hackair_home_v2             homeEthernet 
-##                      939                        5                        5 
-##    homeEthernetFeinstaub           homeV2Ethernet  homeV2EthernetFeinstaub 
-##                        2                        5                        5 
-##               homeV2Lora               homeV2Wifi      homeV2WifiFeinstaub 
-##                       62                      226                      116 
-##                 homeWifi        homeWifiFeinstaub        luftdaten_pms1003 
-##                       14                       17                        0 
-## luftdaten_pms1003_bme280        luftdaten_pms3003 luftdaten_pms3003_bme280 
-##                        1                        0                        0 
-##        luftdaten_pms5003 luftdaten_pms5003_bme280        luftdaten_pms7003 
-##                        2                        8                        0 
-## luftdaten_pms7003_bme280         luftdaten_sds011  luftdaten_sds011_bme280 
-##                        8                       29                      465 
-##  luftdaten_sds011_bmp180   luftdaten_sds011_dht11   luftdaten_sds011_dht22 
-##                       27                        7                      189 
-## 
-## $last_measurement_within
-##    1h    1d   30d  365d never 
-##   731   765   874  1571   522 
-## 
-## oldest box: 2020-02-29 23:00:31 (Kirchardt 1)
-## newest box: 2022-12-30 09:19:46 (Balkon)
-## 
-## sensors per box:
-##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-##   1.000   3.000   5.000   4.959   6.000  29.000
- - - - - -

Another feature of interest is the spatial distribution of the boxes: -plot() can help us out here. This function requires a bunch -of optional dependencies though.

-
if (!require('maps'))     install.packages('maps')
-if (!require('maptools')) install.packages('maptools')
-if (!require('rgeos'))    install.packages('rgeos')
-
-plot(boxes)
-

-

But what do these sensor stations actually measure? Lets find out. -osem_phenomena() gives us a named list of of the counts of -each observed phenomenon for the given set of sensor stations:

-
phenoms = osem_phenomena(boxes)
-str(phenoms)
-
## List of 969
-##  $ Temperatur                             : int 1543
-##  $ rel. Luftfeuchte                       : int 1306
-##  $ PM10                                   : int 1080
-##  $ PM2.5                                  : int 1079
-##  $ Luftdruck                              : int 1004
-##  $ Beleuchtungsstärke                     : int 290
-##  $ UV-Intensität                          : int 290
-##  $ VOC                                    : int 224
-##  $ Lufttemperatur                         : int 208
-##  $ CO₂                                    : int 179
-##  $ Bodenfeuchte                           : int 173
-##  $ Temperature                            : int 167
-##  $ Lautstärke                             : int 136
-##  $ Luftfeuchte                            : int 134
-##  $ Humidity                               : int 124
-##  $ atm. Luftdruck                         : int 114
-##  $ Kalibrierungswert                      : int 108
-##  $ CO2eq                                  : int 107
-##  $ IAQ                                    : int 107
-##  $ rel. Luftfeuchte SCD30                 : int 107
-##  $ Temperatur SCD30                       : int 107
-##  $ Pressure                               : int 98
-##  $ Luftfeuchtigkeit                       : int 61
-##  $ Bodentemperatur                        : int 58
-##  $ PM01                                   : int 58
-##  $ Windgeschwindigkeit                    : int 46
-##  $ Feinstaub PM10                         : int 33
-##  $ Feinstaub PM2.5                        : int 24
-##  $ Batterie                               : int 22
-##  $ Feinstaub PM1.0                        : int 20
-##  $ Taupunkt                               : int 19
-##  $ Windrichtung                           : int 19
-##  $ rel. Luftfeuchte (HECA)                : int 16
-##  $ Temperatur (HECA)                      : int 15
-##  $ Temperatura                            : int 15
-##  $ Durchschnitt Umgebungslautstärke       : int 14
-##  $ Helligkeit                             : int 14
-##  $ Latitude                               : int 14
-##  $ Longtitude                             : int 14
-##  $ Minimum Umgebungslautstärke            : int 14
-##  $ RSSI                                   : int 14
-##  $ UV-Index                               : int 11
-##  $ CO2                                    : int 10
-##  $ PM1                                    : int 10
-##  $ rel-. Luftfeuchte                      : int 10
-##  $ temperature                            : int 10
-##  $ Abstand nach links                     : int 9
-##  $ Abstand nach rechts                    : int 9
-##  $ Beschleunigung X-Achse                 : int 9
-##  $ Beschleunigung Y-Achse                 : int 9
-##  $ Beschleunigung Z-Achse                 : int 9
-##  $ Feinstaub PM25                         : int 9
-##  $ gefühlte Temperatur                    : int 9
-##  $ Luftdruck absolut                      : int 9
-##  $ Luftdruck relativ                      : int 9
-##  $ Regenrate                              : int 9
-##  $ Sonnenstrahlung                        : int 9
-##  $ Geschwindigkeit                        : int 8
-##  $ humidity                               : int 8
-##  $ Bodenfeuchtigkeit                      : int 7
-##  $ Baterie                                : int 6
-##  $ Bodenfeuchte 10cm                      : int 6
-##  $ Bodenfeuchte 30cm                      : int 6
-##  $ Bodentemperatur 10cm                   : int 6
-##  $ Bodentemperatur 30cm                   : int 6
-##  $ Lumina                                 : int 6
-##  $ Taupunktdifferenz (Spread)             : int 6
-##  $ Umiditate                              : int 6
-##  $ UV                                     : int 6
-##  $ Air pressure                           : int 5
-##  $ Battery                                : int 5
-##  $ Höhe (barometrisch)                    : int 5
-##  $ Pegel                                  : int 5
-##  $ PM4                                    : int 5
-##  $ Prezenta-foc                           : int 5
-##  $ Regenmenge                             : int 5
-##  $ W-LAN                                  : int 5
-##  $ absolute Luftfeuchtigkeit              : int 4
-##  $ Bodenfeuchte 1                         : int 4
-##  $ Bodenfeuchte 2                         : int 4
-##  $ Bodentemperatur 1                      : int 4
-##  $ Bodentemperatur 2                      : int 4
-##  $ CO2 Konzentration                      : int 4
-##  $ Gesamt Gewicht                         : int 4
-##  $ PM 2.5                                 : int 4
-##  $ Température                            : int 4
-##  $ Wilgotność                             : int 4
-##  $ Windspeed                              : int 4
-##  $ air pressure                           : int 3
-##  $ Battery Voltage                        : int 3
-##  $ Befehl                                 : int 3
-##  $ Bodenfeuchte 50cm                      : int 3
-##  $ Bodentemperatur 60cm                   : int 3
-##  $ Ciśnienie                              : int 3
-##  $ Feinstaub                              : int 3
-##  $ Höhe                                   : int 3
-##  $ Humedad                                : int 3
-##  $ Laufzeit                               : int 3
-##  $ LuftfeuchteBME                         : int 3
-##   [list output truncated]
-

Thats quite some noise there, with many phenomena being measured by a -single sensor only, or many duplicated phenomena due to slightly -different spellings. We should clean that up, but for now let’s just -filter out the noise and find those phenomena with high sensor -numbers:

-
phenoms[phenoms > 50]
-
## $Temperatur
-## [1] 1543
-## 
-## $`rel. Luftfeuchte`
-## [1] 1306
-## 
-## $PM10
-## [1] 1080
-## 
-## $PM2.5
-## [1] 1079
-## 
-## $Luftdruck
-## [1] 1004
-## 
-## $Beleuchtungsstärke
-## [1] 290
-## 
-## $`UV-Intensität`
-## [1] 290
-## 
-## $VOC
-## [1] 224
-## 
-## $Lufttemperatur
-## [1] 208
-## 
-## $`CO₂`
-## [1] 179
-## 
-## $Bodenfeuchte
-## [1] 173
-## 
-## $Temperature
-## [1] 167
-## 
-## $Lautstärke
-## [1] 136
-## 
-## $Luftfeuchte
-## [1] 134
-## 
-## $Humidity
-## [1] 124
-## 
-## $`atm. Luftdruck`
-## [1] 114
-## 
-## $Kalibrierungswert
-## [1] 108
-## 
-## $CO2eq
-## [1] 107
-## 
-## $IAQ
-## [1] 107
-## 
-## $`rel. Luftfeuchte SCD30`
-## [1] 107
-## 
-## $`Temperatur SCD30`
-## [1] 107
-## 
-## $Pressure
-## [1] 98
-## 
-## $Luftfeuchtigkeit
-## [1] 61
-## 
-## $Bodentemperatur
-## [1] 58
-## 
-## $PM01
-## [1] 58
-
-
-

Plot count of boxes by time

-

By looking at the createdAt attribute of each box we -know the exact time a box was registered. Because of some database -migration issues the createdAt values are mostly wrong -(~80% of boxes created 2022-03-30), so we are using the -timestamp attribute of the currentlocation -which should in most cases correspond to the creation date.

-

With this approach we have no information about boxes that were -deleted in the meantime, but that’s okay for now.

-
-

…and exposure

-
exposure_counts = boxes %>%
-  group_by(exposure) %>%
-  mutate(count = row_number(locationtimestamp))
-
-exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
-ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
-  geom_line() +
-  scale_colour_manual(values = exposure_colors) +
-  xlab('Registration Date') + ylab('senseBox count')
-

-

Outdoor boxes are growing fast! We can also see the -introduction of mobile sensor “stations” in 2017.

-

Let’s have a quick summary:

-
exposure_counts %>%
-  summarise(
-    oldest = min(locationtimestamp),
-    newest = max(locationtimestamp),
-    count = max(count)
-  ) %>%
-  arrange(desc(count))
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
exposureoldestnewestcount
outdoor2022-01-01 11:59:162022-12-30 09:19:461398
indoor2022-01-02 11:06:082022-12-23 20:46:45532
mobile2022-01-06 13:20:002022-12-21 21:35:16201
unknown2022-03-01 07:04:302022-03-01 07:04:301
-
-
-
-

…and grouptag

-

We can try to find out where the increases in growth came from, by -analysing the box count by grouptag.

-

Caveats: Only a small subset of boxes has a grouptag, and we should -assume that these groups are actually bigger. Also, we can see that -grouptag naming is inconsistent (Luftdaten, -luftdaten.info, …)

-
grouptag_counts = boxes %>%
-  group_by(grouptag) %>%
-  # only include grouptags with 20 or more members
-  filter(length(grouptag) >= 15 && !is.na(grouptag) && grouptag != '') %>%
-  mutate(count = row_number(locationtimestamp))
-
## Warning: There were 33 warnings in `filter()`.
-## The first warning was:
-## ℹ In argument: `length(grouptag) >= 15 && !is.na(grouptag) && grouptag != ""`.
-## ℹ In group 11: `grouptag = "321heiss"`.
-## Caused by warning in `length(grouptag) >= 15 && !is.na(grouptag)`:
-## ! 'length(x) = 91 > 1' in coercion to 'logical(1)'
-## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 32 remaining warnings.
-
# helper for sorting the grouptags by boxcount
-sortLvls = function(oldFactor, ascending = TRUE) {
-  lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names()
-  factor(oldFactor, levels = lvls)
-}
-grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
-
-ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
-  geom_line(aes(group = grouptag)) +
-  xlab('Registration Date') + ylab('senseBox count')
-

-
grouptag_counts %>%
-  summarise(
-    oldest = min(locationtimestamp),
-    newest = max(locationtimestamp),
-    count = max(count)
-  ) %>%
-  arrange(desc(count))
-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
grouptagoldestnewestcount
edu2022-01-02 11:06:082022-12-18 12:38:27130
HU Explorers2022-04-01 09:07:412022-12-14 10:11:34128
321heiss2022-07-09 01:29:372022-09-01 06:27:3591
Captographies2022-06-03 11:25:272022-11-16 13:26:3958
SUGUCS2022-11-30 15:25:322022-12-03 10:11:4123
SekSeeland2022-03-14 13:17:172022-03-22 20:23:5819
BurgerMeetnet2022-01-24 15:33:192022-05-10 21:22:3516
AGIN2022-11-28 17:33:122022-11-28 17:42:1815
BRGL2022-11-06 19:23:432022-11-06 22:08:3615
BRGW2022-11-02 10:28:522022-11-02 13:32:1215
Burgermeetnet2022-01-15 20:43:162022-02-11 17:59:0515
HTLJ2022-11-21 22:04:172022-11-21 22:05:4715
Mikroprojekt Mitmachklima2022-02-09 10:28:402022-08-23 13:14:1115
MSGB2022-11-14 09:08:572022-11-14 10:19:2415
MSHO2022-12-20 09:28:402022-12-20 10:01:3815
MSIN2022-11-21 17:02:392022-11-21 23:06:2215
-
-
-
-
-

Plot rate of growth and inactivity per week

-

First we group the boxes by locationtimestamp into bins -of one week:

-
bins = 'week'
-mvavg_bins = 6
-
-growth = boxes %>%
-  mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
-  group_by(week) %>%
-  summarize(count = length(week)) %>%
-  mutate(event = 'registered')
-

We can do the same for updatedAt, which informs us about -the last change to a box, including uploaded measurements. As a lot of -boxes were “updated” by the database migration, many of them are updated -at 2022-03-30, so we try to use the lastMeasurement -attribute instead of updatedAt. This leads to fewer boxes -but also automatically excludes boxes which were created but never made -a measurement.

-

This method of determining inactive boxes is fairly inaccurate and -should be considered an approximation, because we have no information -about intermediate inactive phases. Also deleted boxes would probably -have a big impact here.

-
inactive = boxes %>%
-  # remove boxes that were updated in the last two days,
-  # b/c any box becomes inactive at some point by definition of updatedAt
-  filter(lastMeasurement < now() - days(2)) %>%
-  mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
-  filter(as.Date(week) > as.Date("2021-12-31")) %>%
-  group_by(week) %>%
-  summarize(count = length(week)) %>%
-  mutate(event = 'inactive')
-

Now we can combine both datasets for plotting:

-
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
-
-ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
-  xlab('Time') + ylab(paste('rate per ', bins)) +
-  scale_x_date(date_breaks="years", date_labels="%Y") +
-  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
-  geom_point(aes(y = count), size = 0.5) +
-  # moving average, make first and last value NA (to ensure identical length of vectors)
-  geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
-

-

And see in which weeks the most boxes become (in)active:

-
boxes_by_date %>%
-  filter(count > 50) %>%
-  arrange(desc(count))
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
weekcountevent
2022-11-2193registered
2022-06-0677registered
2022-08-2976registered
2022-10-3172registered
2022-11-1468registered
2022-11-2866registered
2022-08-2261registered
2022-02-2857registered
2022-12-1256registered
2022-08-2956inactive
2022-03-2154registered
2022-01-2451registered
2022-03-0751registered
-
-
-
-

Plot duration of boxes being active

-

While we are looking at locationtimestamp and -lastMeasurement, we can also extract the duration of -activity of each box, and look at metrics by exposure and grouptag once -more:

-
-

…by exposure

-
durations = boxes %>%
-  group_by(exposure) %>%
-  filter(!is.na(lastMeasurement)) %>%
-  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
-  filter(duration >= 0)
-
-ggplot(durations, aes(x = exposure, y = duration)) +
-  geom_boxplot() +
-  coord_flip() + ylab('Duration active in Days')
-

-

The time of activity averages at only 130 days, though there are -boxes with 395 days of activity, spanning a large chunk of -openSenseMap’s existence.

-
-
-

…by grouptag

-
durations = boxes %>%
-  filter(!is.na(lastMeasurement)) %>%
-  group_by(grouptag) %>%
-  # only include grouptags with 20 or more members
-  filter(length(grouptag) >= 15 && !is.na(grouptag) && !is.na(lastMeasurement)) %>%
-  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
-  filter(duration >= 0)
-
## Warning: There were 21 warnings in `filter()`.
-## The first warning was:
-## ℹ In argument: `length(grouptag) >= 15 && !is.na(grouptag) && ...`.
-## ℹ In group 11: `grouptag = "321heiss"`.
-## Caused by warning in `length(grouptag) >= 15 && !is.na(grouptag)`:
-## ! 'length(x) = 81 > 1' in coercion to 'logical(1)'
-## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 20 remaining warnings.
-
ggplot(durations, aes(x = grouptag, y = duration)) +
-  geom_boxplot() +
-  coord_flip() + ylab('Duration active in Days')
-

-
durations %>%
-  summarize(
-    duration_avg = round(mean(duration)),
-    duration_min = round(min(duration)),
-    duration_max = round(max(duration)),
-    oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
-  ) %>%
-  arrange(desc(duration_avg))
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
grouptagduration_avgduration_minduration_maxoldest_box
Burgermeetnet225 days0 days381 days381 days
BurgerMeetnet160 days0 days372 days372 days
Captographies109 days0 days238 days240 days
BRGL85 days81 days86 days86 days
edu83 days0 days385 days389 days
MSGB72 days39 days78 days78 days
HTLJ65 days30 days71 days71 days
MSHO41 days36 days42 days42 days
HU Explorers28 days0 days189 days305 days
321heiss0 days0 days0 days207 days
-
-

The time of activity averages at only 61 days, though there are boxes -with 385 days of activity, spanning a large chunk of openSenseMap’s -existence.

-
-
-

…by year of registration

-

This is less useful, as older boxes are active for a longer time by -definition. If you have an idea how to compensate for that, please send -a Pull -Request!

-
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
-duration = boxes %>%
-  mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
-  group_by(year) %>%
-  filter(!is.na(lastMeasurement)) %>%
-  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
-  filter(duration >= 0)
-
-ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
-  geom_boxplot() +
-  coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
-

-
-
-
-

More Visualisations

-

Other visualisations come to mind, and are left as an exercise to the -reader. If you implemented some, feel free to add them to this vignette -via a Pull -Request.

-
    -
  • growth by phenomenon
  • -
  • growth by location -> (interactive) map
  • -
  • set inactive rate in relation to total box count
  • -
  • filter timespans with big dips in growth rate, and extrapolate the -amount of senseBoxes that could be on the platform today, assuming there -were no production issues ;)
  • -
-
-
-

Issues:

- -
- - - -
-
- -
- - - - - - - - - - - - - - - - - diff --git a/inst/doc/osem-intro.R b/inst/doc/osem-intro.R deleted file mode 100644 index 575f473..0000000 --- a/inst/doc/osem-intro.R +++ /dev/null @@ -1,73 +0,0 @@ -## ----setup, include=FALSE------------------------------------------------ -knitr::opts_chunk$set(echo = TRUE) - -## ----results = F--------------------------------------------------------- -library(magrittr) -library(opensensmapr) - -all_sensors = osem_boxes() - -## ------------------------------------------------------------------------ -summary(all_sensors) - -## ----message=F, warning=F------------------------------------------------ -if (!require('maps')) install.packages('maps') -if (!require('maptools')) install.packages('maptools') -if (!require('rgeos')) install.packages('rgeos') - -plot(all_sensors) - -## ------------------------------------------------------------------------ -phenoms = osem_phenomena(all_sensors) -str(phenoms) - -## ------------------------------------------------------------------------ -phenoms[phenoms > 20] - -## ----results = F--------------------------------------------------------- -pm25_sensors = osem_boxes( - exposure = 'outdoor', - date = Sys.time(), # ±4 hours - phenomenon = 'PM2.5' -) - -## ------------------------------------------------------------------------ -summary(pm25_sensors) -plot(pm25_sensors) - -## ------------------------------------------------------------------------ -library(sf) -library(units) -library(lubridate) -library(dplyr) - -# construct a bounding box: 12 kilometers around Berlin -berlin = st_point(c(13.4034, 52.5120)) %>% - st_sfc(crs = 4326) %>% - st_transform(3857) %>% # allow setting a buffer in meters - st_buffer(set_units(12, km)) %>% - st_transform(4326) %>% # the opensensemap expects WGS 84 - st_bbox() - -## ----results = F--------------------------------------------------------- -pm25 = osem_measurements( - berlin, - phenomenon = 'PM2.5', - from = now() - days(20), # defaults to 2 days - to = now() -) - -plot(pm25) - -## ------------------------------------------------------------------------ -outliers = filter(pm25, value > 100)$sensorId -bad_sensors = outliers[, drop = T] %>% levels() - -pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors) - -## ------------------------------------------------------------------------ -st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T) - -## ------------------------------------------------------------------------ -pm25 %>% filter(invalid == FALSE) %>% plot() - diff --git a/inst/doc/osem-intro.Rmd b/inst/doc/osem-intro.Rmd deleted file mode 100644 index 7a2ad09..0000000 --- a/inst/doc/osem-intro.Rmd +++ /dev/null @@ -1,151 +0,0 @@ ---- -title: "Exploring the openSenseMap Dataset" -author: "Norwin Roosen" -date: "`r Sys.Date()`" -output: - rmarkdown::html_vignette: - fig_margin: 0 - fig_width: 6 - fig_height: 4 -vignette: > - %\VignetteIndexEntry{Exploring the openSenseMap Dataset} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -This package provides data ingestion functions for almost any data stored on the -open data platform for environemental sensordata . -Its main goals are to provide means for: - -- big data analysis of the measurements stored on the platform -- sensor metadata analysis (sensor counts, spatial distribution, temporal trends) - -### Exploring the dataset -Before we look at actual observations, lets get a grasp of the openSenseMap -datasets' structure. - -```{r results = F} -library(magrittr) -library(opensensmapr) - -all_sensors = osem_boxes() -``` -```{r} -summary(all_sensors) -``` - -This gives a good overview already: As of writing this, there are more than 700 -sensor stations, of which ~50% are currently running. Most of them are placed -outdoors and have around 5 sensors each. -The oldest station is from May 2014, while the latest station was registered a -couple of minutes ago. - -Another feature of interest is the spatial distribution of the boxes: `plot()` -can help us out here. This function requires a bunch of optional dependencies though. - -```{r message=F, warning=F} -if (!require('maps')) install.packages('maps') -if (!require('maptools')) install.packages('maptools') -if (!require('rgeos')) install.packages('rgeos') - -plot(all_sensors) -``` - -It seems we have to reduce our area of interest to Germany. - -But what do these sensor stations actually measure? Lets find out. -`osem_phenomena()` gives us a named list of of the counts of each observed -phenomenon for the given set of sensor stations: - -```{r} -phenoms = osem_phenomena(all_sensors) -str(phenoms) -``` - -Thats quite some noise there, with many phenomena being measured by a single -sensor only, or many duplicated phenomena due to slightly different spellings. -We should clean that up, but for now let's just filter out the noise and find -those phenomena with high sensor numbers: - -```{r} -phenoms[phenoms > 20] -``` - -Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more -interesting to analyze though. -We should check how many sensor stations provide useful data: We want only those -boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting -measurements: - -```{r results = F} -pm25_sensors = osem_boxes( - exposure = 'outdoor', - date = Sys.time(), # ±4 hours - phenomenon = 'PM2.5' -) -``` -```{r} -summary(pm25_sensors) -plot(pm25_sensors) -``` - -Thats still more than 200 measuring stations, we can work with that. - -### Analyzing sensor data -Having analyzed the available data sources, let's finally get some measurements. -We could call `osem_measurements(pm25_sensors)` now, however we are focussing on -a restricted area of interest, the city of Berlin. -Luckily we can get the measurements filtered by a bounding box: - -```{r} -library(sf) -library(units) -library(lubridate) -library(dplyr) - -# construct a bounding box: 12 kilometers around Berlin -berlin = st_point(c(13.4034, 52.5120)) %>% - st_sfc(crs = 4326) %>% - st_transform(3857) %>% # allow setting a buffer in meters - st_buffer(set_units(12, km)) %>% - st_transform(4326) %>% # the opensensemap expects WGS 84 - st_bbox() -``` -```{r results = F} -pm25 = osem_measurements( - berlin, - phenomenon = 'PM2.5', - from = now() - days(20), # defaults to 2 days - to = now() -) - -plot(pm25) -``` - -Now we can get started with actual spatiotemporal data analysis. -First, lets mask the seemingly uncalibrated sensors: - -```{r} -outliers = filter(pm25, value > 100)$sensorId -bad_sensors = outliers[, drop = T] %>% levels() - -pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors) -``` - -Then plot the measuring locations, flagging the outliers: - -```{r} -st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T) -``` - -Removing these sensors yields a nicer time series plot: - -```{r} -pm25 %>% filter(invalid == FALSE) %>% plot() -``` - -Further analysis: comparison with LANUV data `TODO` diff --git a/inst/doc/osem-intro.html b/inst/doc/osem-intro.html deleted file mode 100644 index 5835840..0000000 --- a/inst/doc/osem-intro.html +++ /dev/null @@ -1,362 +0,0 @@ - - - - - - - - - - - - - - - - -Exploring the openSenseMap Dataset - - - - - - - - - - - - - - - - - -

Exploring the openSenseMap Dataset

-

Norwin Roosen

-

2018-05-26

- - - -

This package provides data ingestion functions for almost any data stored on the open data platform for environemental sensordata https://opensensemap.org. Its main goals are to provide means for:

- -
-

Exploring the dataset

-

Before we look at actual observations, lets get a grasp of the openSenseMap datasets’ structure.

-
library(magrittr)
-library(opensensmapr)
-
-all_sensors = osem_boxes()
-
summary(all_sensors)
-
## boxes total: 1781
-## 
-## boxes by exposure:
-##  indoor  mobile outdoor unknown 
-##     290      55    1416      20 
-## 
-## boxes by model:
-##                   custom             homeEthernet    homeEthernetFeinstaub 
-##                      336                       92                       49 
-##                 homeWifi        homeWifiFeinstaub        luftdaten_pms1003 
-##                      193                      144                        1 
-## luftdaten_pms1003_bme280 luftdaten_pms5003_bme280 luftdaten_pms7003_bme280 
-##                        1                        5                        2 
-##         luftdaten_sds011  luftdaten_sds011_bme280  luftdaten_sds011_bmp180 
-##                       57                      197                       19 
-##   luftdaten_sds011_dht11   luftdaten_sds011_dht22 
-##                       46                      639 
-## 
-## $last_measurement_within
-##    1h    1d   30d  365d never 
-##   929   954  1091  1428   235 
-## 
-## oldest box: 2014-05-28 15:36:14 (CALIMERO)
-## newest box: 2018-05-26 10:29:27 (UOS_DDI)
-## 
-## sensors per box:
-##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-##     1.0     4.0     4.0     4.6     5.0    33.0
-

This gives a good overview already: As of writing this, there are more than 700 sensor stations, of which ~50% are currently running. Most of them are placed outdoors and have around 5 sensors each. The oldest station is from May 2014, while the latest station was registered a couple of minutes ago.

-

Another feature of interest is the spatial distribution of the boxes: plot() can help us out here. This function requires a bunch of optional dependencies though.

-
if (!require('maps'))     install.packages('maps')
-if (!require('maptools')) install.packages('maptools')
-if (!require('rgeos'))    install.packages('rgeos')
-
-plot(all_sensors)
-

-

It seems we have to reduce our area of interest to Germany.

-

But what do these sensor stations actually measure? Lets find out. osem_phenomena() gives us a named list of of the counts of each observed phenomenon for the given set of sensor stations:

-
phenoms = osem_phenomena(all_sensors)
-str(phenoms)
-
## List of 433
-##  $ Temperatur                                      : int 1608
-##  $ rel. Luftfeuchte                                : int 1422
-##  $ PM10                                            : int 1200
-##  $ PM2.5                                           : int 1198
-##  $ Luftdruck                                       : int 825
-##  $ Beleuchtungsstärke                              : int 481
-##  $ UV-Intensität                                   : int 472
-##  $ Luftfeuchtigkeit                                : int 84
-##  $ Temperature                                     : int 49
-##  $ Humidity                                        : int 42
-##  $ Helligkeit                                      : int 25
-##  $ Lautstärke                                      : int 21
-##  $ Schall                                          : int 20
-##  $ UV                                              : int 20
-##  $ Pressure                                        : int 19
-##  $ Licht                                           : int 18
-##  $ Luftfeuchte                                     : int 14
-##  $ Umgebungslautstärke                             : int 14
-##  $ Lämpötila                                       : int 13
-##  $ Ilmanpaine                                      : int 12
-##  $ Signal                                          : int 12
-##  $ Feinstaub PM10                                  : int 10
-##  $ Feinstaub PM2.5                                 : int 9
-##  $ Kosteus                                         : int 8
-##  $ Temperatur DHT22                                : int 8
-##  $ Valonmäärä                                      : int 8
-##  $ temperature                                     : int 8
-##  $ PM01                                            : int 7
-##  $ UV-säteily                                      : int 7
-##  $ Niederschlag                                    : int 6
-##  $ UV-Strahlung                                    : int 6
-##  $ Wind speed                                      : int 6
-##  $ Windgeschwindigkeit                             : int 6
-##  $ humidity                                        : int 6
-##  $ Ilmankosteus                                    : int 5
-##  $ Wassertemperatur                                : int 5
-##  $ Windrichtung                                    : int 5
-##  $ rel. Luftfeuchtigkeit                           : int 5
-##  $ Druck                                           : int 4
-##  $ Light                                           : int 4
-##  $ Temperature 1                                   : int 4
-##  $ UV Index                                        : int 4
-##  $ UV-Säteily                                      : int 4
-##  $ lautstärke                                      : int 4
-##  $ rel. Luftfeuchte 1                              : int 4
-##  $ rel. Luftfeuchte DHT22                          : int 4
-##  $ relative Luftfeuchtigkeit                       : int 4
-##  $ Air pressure                                    : int 3
-##  $ Batterie                                        : int 3
-##  $ Battery                                         : int 3
-##  $ DS18B20_Probe01                                 : int 3
-##  $ DS18B20_Probe02                                 : int 3
-##  $ DS18B20_Probe03                                 : int 3
-##  $ DS18B20_Probe04                                 : int 3
-##  $ DS18B20_Probe05                                 : int 3
-##  $ Licht (digital)                                 : int 3
-##  $ Luftdruck (BME280)                              : int 3
-##  $ PM 10                                           : int 3
-##  $ PM 2.5                                          : int 3
-##  $ Temp                                            : int 3
-##  $ Temperatur (BME280)                             : int 3
-##  $ Temperatur HDC1008                              : int 3
-##  $ Temperatura                                     : int 3
-##  $ Temperature 2                                   : int 3
-##  $ UV-Index                                        : int 3
-##  $ Valoisuus                                       : int 3
-##  $ Wind Gust                                       : int 3
-##  $ pressure                                        : int 3
-##  $ 1                                               : int 2
-##  $ 10                                              : int 2
-##  $ 2                                               : int 2
-##  $ 3                                               : int 2
-##  $ 4                                               : int 2
-##  $ 5                                               : int 2
-##  $ 6                                               : int 2
-##  $ 7                                               : int 2
-##  $ 8                                               : int 2
-##  $ 9                                               : int 2
-##  $ Air Pressure                                    : int 2
-##  $ Anderer                                         : int 2
-##  $ Battery voltage                                 : int 2
-##  $ CO2                                             : int 2
-##  $ Feuchte                                         : int 2
-##  $ Illuminance                                     : int 2
-##  $ Intensity                                       : int 2
-##  $ Leitfähigkeit                                   : int 2
-##  $ Lichtintensität                                 : int 2
-##  $ Luftdruck BMP180                                : int 2
-##  $ Luftfeuchte (BME280)                            : int 2
-##  $ Luftqualität                                    : int 2
-##  $ Lufttemperatur                                  : int 2
-##  $ PM25                                            : int 2
-##  $ Radioactivity                                   : int 2
-##  $ Radioaktivität                                  : int 2
-##  $ Regen                                           : int 2
-##  $ Relative Humidity                               : int 2
-##  $ Sound                                           : int 2
-##  $ Temperatur (DHT22)                              : int 2
-##  $ Temperatur BMP180                               : int 2
-##   [list output truncated]
-

Thats quite some noise there, with many phenomena being measured by a single sensor only, or many duplicated phenomena due to slightly different spellings. We should clean that up, but for now let’s just filter out the noise and find those phenomena with high sensor numbers:

-
phenoms[phenoms > 20]
-
## $Temperatur
-## [1] 1608
-## 
-## $`rel. Luftfeuchte`
-## [1] 1422
-## 
-## $PM10
-## [1] 1200
-## 
-## $PM2.5
-## [1] 1198
-## 
-## $Luftdruck
-## [1] 825
-## 
-## $Beleuchtungsstärke
-## [1] 481
-## 
-## $`UV-Intensität`
-## [1] 472
-## 
-## $Luftfeuchtigkeit
-## [1] 84
-## 
-## $Temperature
-## [1] 49
-## 
-## $Humidity
-## [1] 42
-## 
-## $Helligkeit
-## [1] 25
-## 
-## $Lautstärke
-## [1] 21
-

Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more interesting to analyze though. We should check how many sensor stations provide useful data: We want only those boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting measurements:

-
pm25_sensors = osem_boxes(
-  exposure = 'outdoor',
-  date = Sys.time(), # ±4 hours
-  phenomenon = 'PM2.5'
-)
-
summary(pm25_sensors)
-
## boxes total: 791
-## 
-## boxes by exposure:
-## outdoor 
-##     791 
-## 
-## boxes by model:
-##                   custom    homeEthernetFeinstaub                 homeWifi 
-##                       29                       37                        6 
-##        homeWifiFeinstaub luftdaten_pms1003_bme280 luftdaten_pms5003_bme280 
-##                       57                        1                        1 
-## luftdaten_pms7003_bme280         luftdaten_sds011  luftdaten_sds011_bme280 
-##                        2                       32                      137 
-##  luftdaten_sds011_bmp180   luftdaten_sds011_dht11   luftdaten_sds011_dht22 
-##                       14                       32                      443 
-## 
-## $last_measurement_within
-##    1h    1d   30d  365d never 
-##   771   780   784   789     2 
-## 
-## oldest box: 2016-06-02 12:09:47 (BalkonBox Mindener Str.)
-## newest box: 2018-05-24 20:29:50 (Stadthalle)
-## 
-## sensors per box:
-##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-##   2.000   4.000   4.000   4.617   5.000  12.000
-
plot(pm25_sensors)
-

-

Thats still more than 200 measuring stations, we can work with that.

-
-
-

Analyzing sensor data

-

Having analyzed the available data sources, let’s finally get some measurements. We could call osem_measurements(pm25_sensors) now, however we are focussing on a restricted area of interest, the city of Berlin. Luckily we can get the measurements filtered by a bounding box:

-
library(sf)
-
## Linking to GEOS 3.5.1, GDAL 2.2.2, proj.4 4.9.2
-
library(units)
-
## 
-## Attaching package: 'units'
-
## The following object is masked from 'package:base':
-## 
-##     %*%
-
library(lubridate)
-library(dplyr)
-
-# construct a bounding box: 12 kilometers around Berlin
-berlin = st_point(c(13.4034, 52.5120)) %>%
-  st_sfc(crs = 4326) %>%
-  st_transform(3857) %>% # allow setting a buffer in meters
-  st_buffer(set_units(12, km)) %>%
-  st_transform(4326) %>% # the opensensemap expects WGS 84
-  st_bbox()
-
pm25 = osem_measurements(
-  berlin,
-  phenomenon = 'PM2.5',
-  from = now() - days(20), # defaults to 2 days
-  to = now()
-)
-
-plot(pm25)
-

-

Now we can get started with actual spatiotemporal data analysis. First, lets mask the seemingly uncalibrated sensors:

-
outliers = filter(pm25, value > 100)$sensorId
-bad_sensors = outliers[, drop = T] %>% levels()
-
-pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
-

Then plot the measuring locations, flagging the outliers:

-
st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
-

-

Removing these sensors yields a nicer time series plot:

-
pm25 %>% filter(invalid == FALSE) %>% plot()
-

-

Further analysis: comparison with LANUV data TODO

-
- - - - - - - - diff --git a/inst/doc/osem-serialization.R b/inst/doc/osem-serialization.R deleted file mode 100644 index 381f7a0..0000000 --- a/inst/doc/osem-serialization.R +++ /dev/null @@ -1,51 +0,0 @@ -## ----setup, results='hide'----------------------------------------------- -# this vignette requires: -library(opensensmapr) -library(jsonlite) -library(readr) - -## ----cache--------------------------------------------------------------- -b = osem_boxes(grouptag = 'ifgi', cache = tempdir()) - -# the next identical request will hit the cache only! -b = osem_boxes(grouptag = 'ifgi', cache = tempdir()) - -# requests without the cache parameter will still be performed normally -b = osem_boxes(grouptag = 'ifgi') - -## ----cachelisting-------------------------------------------------------- -list.files(tempdir(), pattern = 'osemcache\\..*\\.rds') - -## ----cache_custom-------------------------------------------------------- -cacheDir = getwd() # current working directory -b = osem_boxes(grouptag = 'ifgi', cache = cacheDir) - -# the next identical request will hit the cache only! -b = osem_boxes(grouptag = 'ifgi', cache = cacheDir) - -## ----clearcache---------------------------------------------------------- -osem_clear_cache() # clears default cache -osem_clear_cache(getwd()) # clears a custom cache - -## ----data, results='hide'------------------------------------------------ -# first get our example data: -measurements = osem_measurements('Windrichtung') - -## ----serialize_json------------------------------------------------------ -# serializing senseBoxes to JSON, and loading from file again: -write(jsonlite::serializeJSON(measurements), 'measurements.json') -measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json')) -class(measurements_from_file) - -## ----serialize_attrs----------------------------------------------------- -# note the toJSON call instead of serializeJSON -write(jsonlite::toJSON(measurements), 'measurements_bad.json') -measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json') -class(measurements_without_attrs) - -measurements_with_attrs = osem_as_measurements(measurements_without_attrs) -class(measurements_with_attrs) - -## ----cleanup, include=FALSE---------------------------------------------- -file.remove('measurements.json', 'measurements_bad.json') - diff --git a/inst/doc/osem-serialization.Rmd b/inst/doc/osem-serialization.Rmd deleted file mode 100644 index 77eae02..0000000 --- a/inst/doc/osem-serialization.Rmd +++ /dev/null @@ -1,106 +0,0 @@ ---- -title: "Caching openSenseMap Data for Reproducibility" -author: "Norwin Roosen" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Caching openSenseMap Data for Reproducibility} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -It may be useful to download data from openSenseMap only once. -For reproducible results, the data should be saved to disk, and reloaded at a -later point. - -This avoids.. - -- changed results for queries without date parameters, -- unnecessary wait times, -- risk of API changes / API unavailability, -- stress on the openSenseMap-server. - -This vignette shows how to use this built in `opensensmapr` feature, and -how to do it yourself in case you want to save to other data formats. - -```{r setup, results='hide'} -# this vignette requires: -library(opensensmapr) -library(jsonlite) -library(readr) -``` - -## Using the opensensmapr Caching Feature -All data retrieval functions of `opensensmapr` have a built in caching feature, -which serializes an API response to disk. -Subsequent identical requests will then return the serialized data instead of making -another request. - -To use this feature, just add a path to a directory to the `cache` parameter: -```{r cache} -b = osem_boxes(grouptag = 'ifgi', cache = tempdir()) - -# the next identical request will hit the cache only! -b = osem_boxes(grouptag = 'ifgi', cache = tempdir()) - -# requests without the cache parameter will still be performed normally -b = osem_boxes(grouptag = 'ifgi') -``` - -Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL: -```{r cachelisting} -list.files(tempdir(), pattern = 'osemcache\\..*\\.rds') -``` - -You can maintain multiple caches simultaneously which allows to only store data related to a script in the same directory: -```{r cache_custom} -cacheDir = getwd() # current working directory -b = osem_boxes(grouptag = 'ifgi', cache = cacheDir) - -# the next identical request will hit the cache only! -b = osem_boxes(grouptag = 'ifgi', cache = cacheDir) -``` - -To get fresh results again, just call `osem_clear_cache()` for the respective cache: -```{r clearcache, results='hide'} -osem_clear_cache() # clears default cache -osem_clear_cache(getwd()) # clears a custom cache -``` - -## Custom (De-) Serialization -If you want to roll your own serialization method to support custom data formats, -here's how: - -```{r data, results='hide'} -# first get our example data: -measurements = osem_measurements('Windrichtung') -``` - -If you are paranoid and worry about `.rds` files not being decodable anymore -in the (distant) future, you could serialize to a plain text format such as JSON. -This of course comes at the cost of storage space and performance. -```{r serialize_json} -# serializing senseBoxes to JSON, and loading from file again: -write(jsonlite::serializeJSON(measurements), 'measurements.json') -measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json')) -class(measurements_from_file) -``` - -This method also persists the R object metadata (classes, attributes). -If you were to use a serialization method that can't persist object metadata, you -could re-apply it with the following functions: - -```{r serialize_attrs} -# note the toJSON call instead of serializeJSON -write(jsonlite::toJSON(measurements), 'measurements_bad.json') -measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json') -class(measurements_without_attrs) - -measurements_with_attrs = osem_as_measurements(measurements_without_attrs) -class(measurements_with_attrs) -``` -The same goes for boxes via `osem_as_sensebox()`. - -```{r cleanup, include=FALSE} -file.remove('measurements.json', 'measurements_bad.json') -``` diff --git a/inst/doc/osem-serialization.html b/inst/doc/osem-serialization.html deleted file mode 100644 index 42c9d4f..0000000 --- a/inst/doc/osem-serialization.html +++ /dev/null @@ -1,152 +0,0 @@ - - - - - - - - - - - - - - - - -Caching openSenseMap Data for Reproducibility - - - - - - - - - - - - - - - - - -

Caching openSenseMap Data for Reproducibility

-

Norwin Roosen

-

2018-06-07

- - - -

It may be useful to download data from openSenseMap only once. For reproducible results, the data should be saved to disk, and reloaded at a later point.

-

This avoids..

- -

This vignette shows how to use this built in opensensmapr feature, and how to do it yourself in case you want to save to other data formats.

-
# this vignette requires:
-library(opensensmapr)
-library(jsonlite)
-library(readr)
-
-

Using the opensensmapr Caching Feature

-

All data retrieval functions of opensensmapr have a built in caching feature, which serializes an API response to disk. Subsequent identical requests will then return the serialized data instead of making another request.

-

To use this feature, just add a path to a directory to the cache parameter:

-
b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
-
-# the next identical request will hit the cache only!
-b = osem_boxes(grouptag = 'ifgi', cache = tempdir())
-
-# requests without the cache parameter will still be performed normally
-b = osem_boxes(grouptag = 'ifgi')
-

Looking at the cache directory we can see one file for each request, which is identified through a hash of the request URL:

-
list.files(tempdir(), pattern = 'osemcache\\..*\\.rds')
-
## [1] "osemcache.17db5c57fc6fca4d836fa2cf30345ce8767cd61a.rds"
-

You can maintain multiple caches simultaneously which allows to only store data related to a script in the same directory:

-
cacheDir = getwd() # current working directory
-b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
-
-# the next identical request will hit the cache only!
-b = osem_boxes(grouptag = 'ifgi', cache = cacheDir)
-

To get fresh results again, just call osem_clear_cache() for the respective cache:

-
osem_clear_cache()        # clears default cache
-osem_clear_cache(getwd()) # clears a custom cache
-
-
-

Custom (De-) Serialization

-

If you want to roll your own serialization method to support custom data formats, here’s how:

-
# first get our example data:
-measurements = osem_measurements('Windrichtung')
-

If you are paranoid and worry about .rds files not being decodable anymore in the (distant) future, you could serialize to a plain text format such as JSON. This of course comes at the cost of storage space and performance.

-
# serializing senseBoxes to JSON, and loading from file again:
-write(jsonlite::serializeJSON(measurements), 'measurements.json')
-measurements_from_file = jsonlite::unserializeJSON(readr::read_file('measurements.json'))
-class(measurements_from_file)
-
## [1] "osem_measurements" "tbl_df"            "tbl"              
-## [4] "data.frame"
-

This method also persists the R object metadata (classes, attributes). If you were to use a serialization method that can’t persist object metadata, you could re-apply it with the following functions:

-
# note the toJSON call instead of serializeJSON
-write(jsonlite::toJSON(measurements), 'measurements_bad.json')
-measurements_without_attrs = jsonlite::fromJSON('measurements_bad.json')
-class(measurements_without_attrs)
-
## [1] "data.frame"
-
measurements_with_attrs = osem_as_measurements(measurements_without_attrs)
-class(measurements_with_attrs)
-
## [1] "osem_measurements" "tbl_df"            "tbl"              
-## [4] "data.frame"
-

The same goes for boxes via osem_as_sensebox().

-
- - - - - - - -