From 35d9ee697bc4bcb6d0062bb9fe17ffc3c72cbe58 Mon Sep 17 00:00:00 2001 From: jan Date: Wed, 1 Feb 2023 09:37:38 +0100 Subject: [PATCH] history Rmd revised --- vignettes/osem-history.Rmd | 2 + vignettes/osem-history_revised.Rmd | 302 +++++++++++++++++++++++++++++ 2 files changed, 304 insertions(+) create mode 100644 vignettes/osem-history_revised.Rmd diff --git a/vignettes/osem-history.Rmd b/vignettes/osem-history.Rmd index 12aff6a..a04e391 100644 --- a/vignettes/osem-history.Rmd +++ b/vignettes/osem-history.Rmd @@ -30,6 +30,8 @@ library(dplyr) # data wrangling library(ggplot2) # plotting library(lubridate) # date arithmetic library(zoo) # rollmean() +library(devtools) +load_all(".") ``` openSenseMap.org has grown quite a bit in the last years; it would be interesting diff --git a/vignettes/osem-history_revised.Rmd b/vignettes/osem-history_revised.Rmd new file mode 100644 index 0000000..1820d6c --- /dev/null +++ b/vignettes/osem-history_revised.Rmd @@ -0,0 +1,302 @@ +--- +title: "Visualising the Develpment of openSenseMap.org in 2022" +author: "Jan Stenkamp" +date: '`r Sys.Date()`' +output: + html_document: + code_folding: hide + df_print: kable + theme: lumen + toc: yes + toc_float: yes + rmarkdown::html_vignette: + df_print: kable + fig_height: 5 + fig_width: 7 + toc: yes +vignette: > + %\VignetteIndexEntry{Visualising the History of openSenseMap.org} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +--- + +> This vignette serves as an example on data wrangling & visualization with +`opensensmapr`, `dplyr` and `ggplot2`. + +```{r setup, results='hide', message=FALSE, warning=FALSE} +# required packages: +# library(opensensmapr) # data download +library(devtools) +load_all(".") +library(dplyr) # data wrangling +library(ggplot2) # plotting +library(lubridate) # date arithmetic +library(zoo) # rollmean() +``` + +openSenseMap.org has grown quite a bit in the last years; it would be interesting +to see how we got to the current `r osem_counts()$boxes` sensor stations, +split up by various attributes of the boxes. + +While `opensensmapr` provides extensive methods of filtering boxes by attributes +on the server, we do the filtering within R to save time and gain flexibility. + + +So the first step is to retrieve *all the boxes*. + +```{r download, results='hide', message=FALSE, warning=FALSE} +# if you want to see results for a specific subset of boxes, +# just specify a filter such as grouptag='ifgi' here +boxes_all = osem_boxes() +boxes = boxes_all +``` +# Introduction +In the following we just want to have a look at the boxes created in 2022, so we filter for them. + +```{r} +boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp <="2022-12-31") +summary(boxes) -> summary.data.frame +``` + + + + + + + +Another feature of interest is the spatial distribution of the boxes: `plot()` +can help us out here. This function requires a bunch of optional dependencies though. + +```{r message=F, warning=F} +if (!require('maps')) install.packages('maps') +if (!require('maptools')) install.packages('maptools') +if (!require('rgeos')) install.packages('rgeos') + +plot(boxes) +``` + +But what do these sensor stations actually measure? Lets find out. +`osem_phenomena()` gives us a named list of of the counts of each observed +phenomenon for the given set of sensor stations: + +```{r} +phenoms = osem_phenomena(boxes) +str(phenoms) +``` + +Thats quite some noise there, with many phenomena being measured by a single +sensor only, or many duplicated phenomena due to slightly different spellings. +We should clean that up, but for now let's just filter out the noise and find +those phenomena with high sensor numbers: + +```{r} +phenoms[phenoms > 50] +``` + + +# Plot count of boxes by time {.tabset} +By looking at the `createdAt` attribute of each box we know the exact time a box +was registered. Because of some database migration issues the `createdAt` values are mostly wrong (~80% of boxes created 2022-03-30), so we are using the `timestamp` attribute of the `currentlocation` which should in most cases correspond to the creation date. + +With this approach we have no information about boxes that were deleted in the +meantime, but that's okay for now. + +## ...and exposure +```{r exposure_counts, message=FALSE} +exposure_counts = boxes %>% + group_by(exposure) %>% + mutate(count = row_number(locationtimestamp)) + +exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey') +ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) + + geom_line() + + scale_colour_manual(values = exposure_colors) + + xlab('Registration Date') + ylab('senseBox count') +``` + +Outdoor boxes are growing *fast*! +We can also see the introduction of `mobile` sensor "stations" in 2017. + +Let's have a quick summary: +```{r exposure_summary} +exposure_counts %>% + summarise( + oldest = min(locationtimestamp), + newest = max(locationtimestamp), + count = max(count) + ) %>% + arrange(desc(count)) +``` + +## ...and grouptag +We can try to find out where the increases in growth came from, by analysing the +box count by grouptag. + +Caveats: Only a small subset of boxes has a grouptag, and we should assume +that these groups are actually bigger. Also, we can see that grouptag naming is +inconsistent (`Luftdaten`, `luftdaten.info`, ...) + +```{r grouptag_counts, message=FALSE} +grouptag_counts = boxes %>% + group_by(grouptag) %>% + # only include grouptags with 15 or more members + filter(length(grouptag) >= 15 && !is.na(grouptag) && grouptag != '') %>% + mutate(count = row_number(locationtimestamp)) + +# helper for sorting the grouptags by boxcount +sortLvls = function(oldFactor, ascending = TRUE) { + lvls = table(oldFactor) %>% sort(., decreasing = !ascending) %>% names() + factor(oldFactor, levels = lvls) +} +grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE) + +ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) + + geom_line(aes(group = grouptag)) + + xlab('Registration Date') + ylab('senseBox count') +``` + +```{r grouptag_summary} +grouptag_counts %>% + summarise( + oldest = min(locationtimestamp), + newest = max(locationtimestamp), + count = max(count) + ) %>% + arrange(desc(count)) +``` + +# Plot rate of growth and inactivity per week +First we group the boxes by `locationtimestamp` into bins of one week: +```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'} +bins = 'week' +mvavg_bins = 6 + +growth = boxes %>% + mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'registered') +``` + +We can do the same for `updatedAt`, which informs us about the last change to +a box, including uploaded measurements. As a lot of boxes were "updated" by the database +migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement` +attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes +boxes which were created but never made a measurement. + +This method of determining inactive boxes is fairly inaccurate and should be +considered an approximation, because we have no information about intermediate +inactive phases. +Also deleted boxes would probably have a big impact here. +```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'} +inactive = boxes %>% + # remove boxes that were updated in the last two days, + # b/c any box becomes inactive at some point by definition of updatedAt + filter(lastMeasurement < now() - days(2)) %>% + mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>% + filter(as.Date(week) > as.Date("2021-12-31")) %>% + group_by(week) %>% + summarize(count = length(week)) %>% + mutate(event = 'inactive') +``` + +Now we can combine both datasets for plotting: +```{r growthrate, warning=FALSE, message=FALSE, results='hide'} +boxes_by_date = bind_rows(growth, inactive) %>% group_by(event) + +ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) + + xlab('Time') + ylab(paste('rate per ', bins)) + + scale_x_date(date_breaks="years", date_labels="%Y") + + scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) + + geom_point(aes(y = count), size = 0.5) + + # moving average, make first and last value NA (to ensure identical length of vectors) + geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))) +``` + +And see in which weeks the most boxes become (in)active: +```{r table_mostregistrations} +boxes_by_date %>% + filter(count > 50) %>% + arrange(desc(count)) +``` + +# Plot duration of boxes being active {.tabset} +While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity +of each box, and look at metrics by exposure and grouptag once more: + +## ...by exposure +```{r exposure_duration, message=FALSE} +durations = boxes %>% + group_by(exposure) %>% + filter(!is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(durations, aes(x = exposure, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') +``` + +The time of activity averages at only `r round(mean(durations$duration))` days, +though there are boxes with `r round(max(durations$duration))` days of activity, +spanning a large chunk of openSenseMap's existence. + +## ...by grouptag +```{r grouptag_duration, message=FALSE} +durations = boxes %>% + filter(!is.na(lastMeasurement)) %>% + group_by(grouptag) %>% + # only include grouptags with 20 or more members + filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(durations, aes(x = grouptag, y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + +durations %>% + summarize( + duration_avg = round(mean(duration)), + duration_min = round(min(duration)), + duration_max = round(max(duration)), + oldest_box = round(max(difftime(now(), locationtimestamp, units='days'))) + ) %>% + arrange(desc(duration_avg)) +``` + +The time of activity averages at only `r round(mean(durations$duration))` days, +though there are boxes with `r round(max(durations$duration))` days of activity, +spanning a large chunk of openSenseMap's existence. + +## ...by year of registration +This is less useful, as older boxes are active for a longer time by definition. +If you have an idea how to compensate for that, please send a [Pull Request][PR]! + +```{r year_duration, message=FALSE} +# NOTE: boxes older than 2016 missing due to missing updatedAt in database +duration = boxes %>% + mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>% + group_by(year) %>% + filter(!is.na(lastMeasurement)) %>% + mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>% + filter(duration >= 0) + +ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) + + geom_boxplot() + + coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration') +``` + +# More Visualisations +Other visualisations come to mind, and are left as an exercise to the reader. +If you implemented some, feel free to add them to this vignette via a [Pull Request][PR]. + +* growth by phenomenon +* growth by location -> (interactive) map +* set inactive rate in relation to total box count +* filter timespans with big dips in growth rate, and extrapolate the amount of + senseBoxes that could be on the platform today, assuming there were no production issues ;) + +[PR]: https://github.com/sensebox/opensensmapr/pulls + +