Compare commits

...

26 Commits

@ -1,6 +1,6 @@
^.*\.Rproj$
^\.Rproj\.user$
^CHANGES\.md$
^NEWS\.md$
^tools*$
^\.travis\.yml$
^appveyor\.yml$
@ -8,3 +8,4 @@
^codecov\.yml$
^\.lintr$
^opensensmapr_.*\.tar\.gz$
^cran-comments\.md$

1
.gitignore vendored

@ -5,5 +5,6 @@
.Ruserdata
*.Rcheck
*.log
cran-comments.md
opensensmapr_*.tar.gz

@ -1,9 +1,9 @@
Package: opensensmapr
Type: Package
Title: Client for the Data API of openSenseMap.org
Version: 0.5.1
URL: http://github.com/sensebox/opensensmapR
BugReports: http://github.com/sensebox/opensensmapR/issues
Version: 0.6.0
URL: https://github.com/sensebox/opensensmapR
BugReports: https://github.com/sensebox/opensensmapR/issues
Imports:
dplyr,
httr,
@ -28,8 +28,9 @@ Suggests:
lintr,
testthat,
covr
Authors@R: c(person("Norwin", "Roosen", role = c("aut", "cre"), email = "hello@nroo.de"),
person("Daniel", "Nuest", role = c("ctb"), email = "daniel.nuest@uni-muenster.de", comment = c(ORCID = "0000-0003-2392-6140")))
Authors@R: c(person("Norwin", "Roosen", role = c("aut"), email = "hello@nroo.de"),
person("Daniel", "Nuest", role = c("ctb"), email = "daniel.nuest@uni-muenster.de", comment = c(ORCID = "0000-0003-2392-6140")),
person("Jan", "Stenkamp", role = c("ctb", "cre"), email = "jan.stenkamp@uni-muenster.de"))
Description: Download environmental measurements and sensor station metadata
from the API of open data sensor web platform <https://opensensemap.org> for
analysis in R.
@ -41,5 +42,5 @@ Description: Download environmental measurements and sensor station metadata
License: GPL (>= 2) | file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
RoxygenNote: 7.2.3
VignetteBuilder: knitr

@ -1,6 +1,17 @@
# opensensmapr changelog
This project does its best to adhere to semantic versioning.
### 2023-02-20: v0.6.0
- fix package bugs to pass CRAN tests after 4 years of maintenance break
- updated hyperlinks
- don't throw an error for empty sensors
- updated tests
- updated maintainer
- updated vignettes
- new features:
- added param bbox for osem_boxes function
- support of multiple grouptags
### 2019-02-09: v0.5.1
- fix package to work with API v6
- box$lastMeasurement may be missing now for long inactive boxes

@ -45,8 +45,9 @@ get_boxes_ = function (..., endpoint) {
df = dplyr::bind_rows(boxesList)
df$exposure = df$exposure %>% as.factor()
df$model = df$model %>% as.factor()
if (!is.null(df$grouptag))
if (!is.null(df$grouptag)){
df$grouptag = df$grouptag %>% as.factor()
}
df
}
@ -70,7 +71,7 @@ parse_measurement_csv = function (resText) {
})
osem_as_measurements(result)
}
}
get_measurements_ = function (..., endpoint) {
osem_get_resource(endpoint, c('boxes', 'data'), ..., type = 'text') %>%
@ -120,11 +121,12 @@ osem_cache_filename = function (path, query = list(), host = osem_endpoint()) {
#'
#' @export
#' @examples
#' \donttest{
#' \dontrun{
#' osem_boxes(cache = tempdir())
#' osem_clear_cache()
#'
#' cachedir = paste(getwd(), 'osemcache', sep = '/')
#' dir.create(file.path(cachedir), showWarnings = FALSE)
#' osem_boxes(cache = cachedir)
#' osem_clear_cache(cachedir)
#' }

@ -12,7 +12,7 @@ osem_archive_endpoint = function () {
#'
#' This function is significantly faster than \code{\link{osem_measurements}} for large
#' time-frames, as daily CSV dumps for each sensor from
#' \href{http://archive.opensensemap.org}{archive.opensensemap.org} are used.
#' \href{https://archive.opensensemap.org}{archive.opensensemap.org} are used.
#' Note that the latest data available is from the previous day.
#'
#' By default, data for all sensors of a box is fetched, but you can select a
@ -115,7 +115,7 @@ archive_fetch_measurements = function (box, sensorId, fromDate, toDate, progress
))
if (httr::status_code(res) == 404)
return(data.frame(createdAt = character(), value = character()))
return(data.frame(createdAt = as.POSIXlt(x = integer(0), origin = date), value = double()))
}
measurements = httr::content(res, type = 'text', encoding = 'UTF-8') %>%

@ -18,6 +18,10 @@
#' @param to Only return boxes that were measuring earlier than this time
#' @param phenomenon Only return boxes that measured the given phenomenon in the
#' time interval as specified through \code{date} or \code{from / to}
#' @param bbox Only return boxes that are within the given boundingbox,
#' vector of 4 WGS84 coordinates.
#' Order is: longitude southwest, latitude southwest, longitude northeast, latitude northeast.
#' Minimal and maximal values are: -180, 180 for longitude and -90, 90 for latitude.
#' @param endpoint The URL of the openSenseMap API instance
#' @param progress Whether to print download progress information, defaults to \code{TRUE}
#' @param cache Whether to cache the result, defaults to false.
@ -33,7 +37,7 @@
#' @export
#' @examples
#'
#' \donttest{
#' \dontrun{
#' # get *all* boxes available on the API
#' b = osem_boxes()
#'
@ -67,7 +71,8 @@
#' b = osem_boxes(progress = FALSE)
#' }
osem_boxes = function (exposure = NA, model = NA, grouptag = NA,
date = NA, from = NA, to = NA, phenomenon = NA,
date = NA, from = NA, to = NA, phenomenon = NA,
bbox = NA,
endpoint = osem_endpoint(),
progress = TRUE,
cache = NA) {
@ -93,12 +98,14 @@ osem_boxes = function (exposure = NA, model = NA, grouptag = NA,
if (!is.na(model)) query$model = model
if (!is.na(grouptag)) query$grouptag = grouptag
if (!is.na(phenomenon)) query$phenomenon = phenomenon
if (all(!is.na(bbox))) query$bbox = paste(bbox, collapse = ', ')
if (!is.na(to) && !is.na(from))
query$date = parse_dateparams(from, to) %>% paste(collapse = ',')
else if (!is.na(date))
query$date = date_as_utc(date) %>% date_as_isostring()
do.call(get_boxes_, query)
}
@ -118,7 +125,7 @@ osem_boxes = function (exposure = NA, model = NA, grouptag = NA,
#' @seealso \code{\link{osem_clear_cache}}
#' @export
#' @examples
#' \donttest{
#' \dontrun{
#' # get a specific box by ID
#' b = osem_box('57000b8745fd40c8196ad04c')
#'
@ -148,7 +155,10 @@ parse_senseboxdata = function (boxdata) {
sensors = boxdata$sensors
location = boxdata$currentLocation
lastMeasurement = boxdata$lastMeasurementAt # rename for backwards compat < 0.5.1
boxdata[c('loc', 'locations', 'currentLocation', 'sensors', 'image', 'boxType', 'lastMeasurementAt')] = NULL
grouptags = boxdata$grouptag
boxdata[c(
'loc', 'locations', 'currentLocation', 'sensors', 'image', 'boxType', 'lastMeasurementAt', 'grouptag'
)] = NULL
thebox = as.data.frame(boxdata, stringsAsFactors = F)
# parse timestamps (updatedAt might be not defined)
@ -158,6 +168,11 @@ parse_senseboxdata = function (boxdata) {
if (!is.null(lastMeasurement))
thebox$lastMeasurement = isostring_as_date(lastMeasurement)
# add empty sensortype to sensors without type
if(!('sensorType' %in% names(sensors[[1]]))) {
sensors[[1]]$sensorType <- NA
}
# create a dataframe of sensors
thebox$sensors = sensors %>%
recursive_lapply(function (x) if (is.null(x)) NA else x) %>% # replace NULLs with NA
@ -175,9 +190,27 @@ parse_senseboxdata = function (boxdata) {
# extract coordinates & transform to simple feature object
thebox$lon = location$coordinates[[1]]
thebox$lat = location$coordinates[[2]]
thebox$locationtimestamp = isostring_as_date(location$timestamp)
if (length(location$coordinates) == 3)
thebox$height = location$coordinates[[3]]
# extract grouptag(s) from box
if (length(grouptags) == 0)
thebox$grouptag = NULL
if (length(grouptags) > 0) {
# if box does not have grouptag dont set attribute
if(grouptags[[1]] == '') {
thebox$grouptag = NULL
}
else {
thebox$grouptag = grouptags[[1]]
}
}
if (length(grouptags) > 1)
thebox$grouptag2 = grouptags[[2]]
if (length(grouptags) > 2)
thebox$grouptag3 = grouptags[[3]]
# attach a custom class for methods
osem_as_sensebox(thebox)
}

@ -39,7 +39,7 @@ osem_measurements = function (x, ...) UseMethod('osem_measurements')
#' @describeIn osem_measurements Get measurements from \strong{all} senseBoxes.
#' @export
#' @examples
#' \donttest{
#' \dontrun{
#' # get measurements from all boxes on the phenomenon 'PM10' from the last 48h
#' m = osem_measurements('PM10')
#'
@ -80,6 +80,7 @@ osem_measurements.default = function (x, ...) {
#' # construct a bounding box 12km around berlin using the sf package,
#' # and get measurements from stations within that box
#' library(sf)
#' library(units)
#' bbox2 = st_point(c(13.4034, 52.5120)) %>%
#' st_sfc(crs = 4326) %>%
#' st_transform(3857) %>% # allow setting a buffer in meters

@ -18,7 +18,7 @@ print.osem_measurements = function (x, ...) {
#' Should have at least a `value` and `createdAt` column.
#' @export
osem_as_measurements = function(x) {
  # Convert once with as_tibble(); the deprecated tibble::as.tibble() call
  # was redundant because its result was immediately overwritten.
  ret = tibble::as_tibble(x)
  # Prepend the custom class so osem_measurements S3 methods (e.g. print)
  # take precedence over the tibble / data.frame ones.
  class(ret) = c('osem_measurements', class(ret))
  ret
}

@ -34,9 +34,9 @@ There are also vignettes showcasing applications of this package:
- [Exploring the openSenseMap dataset][osem-intro]: Showcase of included helper functions
- [Caching openSenseMap Data for reproducibility][osem-serialization]
[osem-intro]: https://sensebox.github.com/opensensmapR/inst/doc/osem-intro.html
[osem-history]: https://sensebox.github.com/opensensmapR/inst/doc/osem-history.html
[osem-serialization]: https://sensebox.github.com/opensensmapR/inst/doc/osem-serialization.html
[osem-intro]: https://sensebox.github.io/opensensmapR/inst/doc/osem-intro.html
[osem-history]: https://sensebox.github.io/opensensmapR/inst/doc/osem-history.html
[osem-serialization]: https://sensebox.github.io/opensensmapR/inst/doc/osem-serialization.html
If you used this package for an analysis and think it could serve as a good
example or showcase, feel free to add a vignette to the package via a [PR](#contribute)!

@ -0,0 +1,302 @@
---
title: "Visualising the Development of openSenseMap.org in 2022"
author: "Jan Stenkamp"
date: '`r Sys.Date()`'
output:
html_document:
code_folding: hide
df_print: kable
theme: lumen
toc: yes
toc_float: yes
rmarkdown::html_vignette:
df_print: kable
fig_height: 5
fig_width: 7
toc: yes
vignette: >
%\VignetteIndexEntry{Visualising the Development of openSenseMap.org in 2022}
%\VignetteEncoding{UTF-8}
%\VignetteEngine{knitr::rmarkdown}
---
> This vignette serves as an example on data wrangling & visualization with
`opensensmapr`, `dplyr` and `ggplot2`.
```{r setup, results='hide', message=FALSE, warning=FALSE}
# required packages:
# Attach the installed package instead of devtools::load_all("."), which only
# works when the working directory is the package root and devtools is
# installed.
library(opensensmapr) # data download
library(dplyr)        # data wrangling
library(ggplot2)      # plotting
library(lubridate)    # date arithmetic
library(zoo)          # rollmean()
```
openSenseMap.org has grown quite a bit in the last years; it would be interesting
to see how we got to the current `r osem_counts()$boxes` sensor stations,
split up by various attributes of the boxes.
While `opensensmapr` provides extensive methods of filtering boxes by attributes
on the server, we do the filtering within R to save time and gain flexibility.
So the first step is to retrieve *all the boxes*.
```{r download, results='hide', message=FALSE, warning=FALSE}
# if you want to see results for a specific subset of boxes,
# just specify a filter such as grouptag='ifgi' here
boxes_all = osem_boxes()
boxes = boxes_all
```
# Introduction
In the following we just want to have a look at the boxes created in 2022, so we filter for them.
```{r}
# Keep only boxes whose location timestamp lies in 2022 (UTC).
# Explicit date-time bounds with a half-open interval are used because a
# character bound such as "2022-12-31" is coerced to midnight at the start of
# that day in the session's time zone, which drops boxes registered during
# Dec 31 and makes the result depend on the locale.
# NOTE(review): assumes `locationtimestamp` is a POSIXct column - confirm.
boxes = filter(
  boxes,
  locationtimestamp >= as.POSIXct("2022-01-01", tz = "UTC") &
    locationtimestamp < as.POSIXct("2023-01-01", tz = "UTC")
)
# the summary is stored rather than printed
summary(boxes) -> summary.data.frame
```
<!-- This gives a good overview already: As of writing this, there are more than 11,000 -->
<!-- sensor stations, of which ~30% are currently running. Most of them are placed -->
<!-- outdoors and have around 5 sensors each. -->
<!-- The oldest station is from August 2016, while the latest station was registered a -->
<!-- couple of minutes ago. -->
Another feature of interest is the spatial distribution of the boxes: `plot()`
can help us out here. This function requires a bunch of optional dependencies though.
```{r message=F, warning=F}
# Make sure the optional mapping packages are available, then draw the boxes.
# NOTE(review): require() + install.packages() installs packages as a side
# effect of knitting; consider requireNamespace() and skipping the plot
# instead. TODO confirm which of these packages plot() still needs
# (maptools and rgeos have reportedly been retired from CRAN).
if (!require('maps')) install.packages('maps')
if (!require('maptools')) install.packages('maptools')
if (!require('rgeos')) install.packages('rgeos')
plot(boxes)
```
But what do these sensor stations actually measure? Let's find out.
`osem_phenomena()` gives us a named list of the counts of each observed
phenomenon for the given set of sensor stations:
```{r}
phenoms = osem_phenomena(boxes)
str(phenoms)
```
That's quite some noise there, with many phenomena being measured by a single
sensor only, or many duplicated phenomena due to slightly different spellings.
We should clean that up, but for now let's just filter out the noise and find
those phenomena with high sensor numbers:
```{r}
phenoms[phenoms > 50]
```
# Plot count of boxes by time {.tabset}
By looking at the `createdAt` attribute of each box we know the exact time a box
was registered. Because of some database migration issues, the `createdAt` values are mostly wrong (~80% of boxes appear to have been created on 2022-03-30). We therefore use the `timestamp` attribute of each box's `currentLocation` (available as the `locationtimestamp` column), which should in most cases correspond to the creation date.
With this approach we have no information about boxes that were deleted in the
meantime, but that's okay for now.
## ...and exposure
```{r exposure_counts, message=FALSE}
# Running count of boxes per exposure type: within each exposure group every
# box gets its rank by location timestamp. The result stays grouped by
# `exposure`.
exposure_counts = boxes %>%
group_by(exposure) %>%
mutate(count = row_number(locationtimestamp))
# one fixed colour per exposure category, keyed by the category name
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')
# one line per exposure category: cumulative box count over time
ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
geom_line() +
scale_colour_manual(values = exposure_colors) +
xlab('Registration Date') + ylab('senseBox count')
```
Outdoor boxes are growing *fast*!
We can also see the introduction of `mobile` sensor "stations" in 2017.
Let's have a quick summary:
```{r exposure_summary}
exposure_counts %>%
summarise(
oldest = min(locationtimestamp),
newest = max(locationtimestamp),
count = max(count)
) %>%
arrange(desc(count))
```
## ...and grouptag
We can try to find out where the increases in growth came from, by analysing the
box count by grouptag.
Caveats: Only a small subset of boxes has a grouptag, and we should assume
that these groups are actually bigger. Also, we can see that grouptag naming is
inconsistent (`Luftdaten`, `luftdaten.info`, ...)
```{r grouptag_counts, message=FALSE}
# Running count of boxes per grouptag, ranked by location timestamp within
# each group. The result stays grouped by `grouptag`.
grouptag_counts = boxes %>%
  group_by(grouptag) %>%
  # only include grouptags with 15 or more members.
  # Use the element-wise `&`: the NA and empty-string checks yield one value
  # per row, and `&&` raises an error for operands longer than one element
  # since R 4.3.
  filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>%
  mutate(count = row_number(locationtimestamp))
# Helper for sorting the grouptags by box count: returns `oldFactor` with its
# levels reordered by how often each level occurs (least frequent first when
# `ascending` is TRUE, most frequent first otherwise).
sortLvls = function(oldFactor, ascending = TRUE) {
  frequencies = table(oldFactor)
  rankedLevels = names(sort(frequencies, decreasing = !ascending))
  factor(oldFactor, levels = rankedLevels)
}
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)
ggplot(grouptag_counts, aes(x = locationtimestamp, y = count, colour = grouptag)) +
geom_line(aes(group = grouptag)) +
xlab('Registration Date') + ylab('senseBox count')
```
```{r grouptag_summary}
grouptag_counts %>%
summarise(
oldest = min(locationtimestamp),
newest = max(locationtimestamp),
count = max(count)
) %>%
arrange(desc(count))
```
# Plot rate of growth and inactivity per week
First we group the boxes by `locationtimestamp` into bins of one week:
```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
bins = 'week'
mvavg_bins = 6
growth = boxes %>%
mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
group_by(week) %>%
summarize(count = length(week)) %>%
mutate(event = 'registered')
```
We can do the same for `updatedAt`, which informs us about the last change to
a box, including uploaded measurements. As a lot of boxes were "updated" by the database
migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement`
attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes
boxes which were created but never made a measurement.
This method of determining inactive boxes is fairly inaccurate and should be
considered an approximation, because we have no information about intermediate
inactive phases.
Also deleted boxes would probably have a big impact here.
```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
# Weekly count of boxes that became inactive, binned by the week of their
# last measurement and labelled with event = 'inactive'.
inactive = boxes %>%
# remove boxes that delivered a measurement in the last two days,
# b/c any box becomes inactive at some point by definition of lastMeasurement
filter(lastMeasurement < now() - days(2)) %>%
mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
# keep only weekly bins that start after 2021-12-31
filter(as.Date(week) > as.Date("2021-12-31")) %>%
group_by(week) %>%
summarize(count = length(week)) %>%
mutate(event = 'inactive')
```
Now we can combine both datasets for plotting:
```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)
ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
xlab('Time') + ylab(paste('rate per ', bins)) +
scale_x_date(date_breaks="years", date_labels="%Y") +
scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
geom_point(aes(y = count), size = 0.5) +
# moving average, make first and last value NA (to ensure identical length of vectors)
geom_line(aes(y = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA))))
```
And see in which weeks the most boxes become (in)active:
```{r table_mostregistrations}
boxes_by_date %>%
filter(count > 50) %>%
arrange(desc(count))
```
# Plot duration of boxes being active {.tabset}
While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity
of each box, and look at metrics by exposure and grouptag once more:
## ...by exposure
```{r exposure_duration, message=FALSE}
durations = boxes %>%
group_by(exposure) %>%
filter(!is.na(lastMeasurement)) %>%
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
filter(duration >= 0)
ggplot(durations, aes(x = exposure, y = duration)) +
geom_boxplot() +
coord_flip() + ylab('Duration active in Days')
```
The time of activity averages at only `r round(mean(durations$duration))` days,
though there are boxes with `r round(max(durations$duration))` days of activity,
spanning a large chunk of openSenseMap's existence.
## ...by grouptag
```{r grouptag_duration, message=FALSE}
# Activity duration per box (last measurement minus location timestamp, in
# days), restricted to larger grouptags. The result stays grouped by `grouptag`.
durations = boxes %>%
filter(!is.na(lastMeasurement)) %>%
group_by(grouptag) %>%
# only include grouptags with 15 or more members
filter(length(grouptag) >= 15 & !is.na(grouptag) & !is.na(lastMeasurement)) %>%
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
# drop boxes whose last measurement precedes their location timestamp
filter(duration >= 0)
ggplot(durations, aes(x = grouptag, y = duration)) +
geom_boxplot() +
coord_flip() + ylab('Duration active in Days')
durations %>%
summarize(
duration_avg = round(mean(duration)),
duration_min = round(min(duration)),
duration_max = round(max(duration)),
oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
) %>%
arrange(desc(duration_avg))
```
The time of activity averages at only `r round(mean(durations$duration))` days,
though there are boxes with `r round(max(durations$duration))` days of activity,
spanning a large chunk of openSenseMap's existence.
## ...by year of registration
This is less useful, as older boxes are active for a longer time by definition.
If you have an idea how to compensate for that, please send a [Pull Request][PR]!
```{r year_duration, message=FALSE}
# NOTE: boxes older than 2016 missing due to missing updatedAt in database
duration = boxes %>%
mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
group_by(year) %>%
filter(!is.na(lastMeasurement)) %>%
mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
filter(duration >= 0)
ggplot(duration, aes(x = substr(as.character(year), 0, 4), y = duration)) +
geom_boxplot() +
coord_flip() + ylab('Duration active in Days') + xlab('Year of Registration')
```
# More Visualisations
Other visualisations come to mind, and are left as an exercise to the reader.
If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
* growth by phenomenon
* growth by location -> (interactive) map
* set inactive rate in relation to total box count
* filter timespans with big dips in growth rate, and extrapolate the amount of
senseBoxes that could be on the platform today, assuming there were no production issues ;)
[PR]: https://github.com/sensebox/opensensmapr/pulls

File diff suppressed because one or more lines are too long

@ -112,11 +112,16 @@ openSenseMap API: \url{https://api.opensensemap.org/}
official openSenseMap API documentation: \url{https://docs.opensensemap.org/}
}
\author{
\strong{Maintainer}: Norwin Roosen \email{hello@nroo.de}
\strong{Maintainer}: Jan Stenkamp \email{jan.stenkamp@uni-muenster.de} [contributor]
Authors:
\itemize{
\item Norwin Roosen \email{hello@nroo.de}
}
Other contributors:
\itemize{
\item Daniel Nuest \email{daniel.nuest@uni-muenster.de} (0000-0003-2392-6140) [contributor]
\item Daniel Nuest \email{daniel.nuest@uni-muenster.de} (\href{https://orcid.org/0000-0003-2392-6140}{ORCID}) [contributor]
}
}

@ -21,7 +21,7 @@ A \code{sensebox data.frame} containing a box in each row
Get a single senseBox by its ID
}
\examples{
\donttest{
\dontrun{
# get a specific box by ID
b = osem_box('57000b8745fd40c8196ad04c')

@ -4,9 +4,19 @@
\alias{osem_boxes}
\title{Get a set of senseBoxes from the openSenseMap}
\usage{
osem_boxes(exposure = NA, model = NA, grouptag = NA, date = NA,
from = NA, to = NA, phenomenon = NA, endpoint = osem_endpoint(),
progress = TRUE, cache = NA)
osem_boxes(
exposure = NA,
model = NA,
grouptag = NA,
date = NA,
from = NA,
to = NA,
phenomenon = NA,
bbox = NA,
endpoint = osem_endpoint(),
progress = TRUE,
cache = NA
)
}
\arguments{
\item{exposure}{Only return boxes with the given exposure ('indoor', 'outdoor', 'mobile')}
@ -24,6 +34,11 @@ osem_boxes(exposure = NA, model = NA, grouptag = NA, date = NA,
\item{phenomenon}{Only return boxes that measured the given phenomenon in the
time interval as specified through \code{date} or \code{from / to}}
\item{bbox}{Only return boxes that are within the given boundingbox,
vector of 4 WGS84 coordinates.
Order is: longitude southwest, latitude southwest, longitude northeast, latitude northeast.
Minimal and maximal values are: -180, 180 for longitude and -90, 90 for latitude.}
\item{endpoint}{The URL of the openSenseMap API instance}
\item{progress}{Whether to print download progress information, defaults to \code{TRUE}}
@ -46,7 +61,7 @@ Note that some filters do not work together:
}
\examples{
\donttest{
\dontrun{
# get *all* boxes available on the API
b = osem_boxes()

@ -17,11 +17,12 @@ Boolean whether the deletion was successful
Purge cached responses from the given cache directory
}
\examples{
\donttest{
\dontrun{
osem_boxes(cache = tempdir())
osem_clear_cache()
cachedir = paste(getwd(), 'osemcache', sep = '/')
dir.create(file.path(cachedir), showWarnings = FALSE)
osem_boxes(cache = cachedir)
osem_clear_cache(cachedir)
}

@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/api.R
\name{osem_ensure_api_available}
\alias{osem_ensure_api_available}
\title{Check if the given openSenseMap API endpoint is available}
\usage{
osem_ensure_api_available(endpoint = osem_endpoint())
}
\arguments{
\item{endpoint}{The API base URL to check, defaulting to \code{\link{osem_endpoint}}}
}
\value{
\code{TRUE} if the API is available, otherwise \code{stop()} is called.
}
\description{
Check if the given openSenseMap API endpoint is available
}

@ -11,13 +11,31 @@ osem_measurements(x, ...)
\method{osem_measurements}{default}(x, ...)
\method{osem_measurements}{bbox}(x, phenomenon, exposure = NA,
from = NA, to = NA, columns = NA, ...,
endpoint = osem_endpoint(), progress = T, cache = NA)
\method{osem_measurements}{sensebox}(x, phenomenon, exposure = NA,
from = NA, to = NA, columns = NA, ...,
endpoint = osem_endpoint(), progress = T, cache = NA)
\method{osem_measurements}{bbox}(
x,
phenomenon,
exposure = NA,
from = NA,
to = NA,
columns = NA,
...,
endpoint = osem_endpoint(),
progress = T,
cache = NA
)
\method{osem_measurements}{sensebox}(
x,
phenomenon,
exposure = NA,
from = NA,
to = NA,
columns = NA,
...,
endpoint = osem_endpoint(),
progress = T,
cache = NA
)
}
\arguments{
\item{x}{Depending on the method, either
@ -58,15 +76,15 @@ a bounding box spanning the whole world.
}
\section{Methods (by class)}{
\itemize{
\item \code{default}: Get measurements from \strong{all} senseBoxes.
\item \code{osem_measurements(default)}: Get measurements from \strong{all} senseBoxes.
\item \code{bbox}: Get measurements by a spatial filter.
\item \code{osem_measurements(bbox)}: Get measurements by a spatial filter.
\item \code{sensebox}: Get measurements from a set of senseBoxes.
}}
\item \code{osem_measurements(sensebox)}: Get measurements from a set of senseBoxes.
}}
\examples{
\donttest{
\dontrun{
# get measurements from all boxes on the phenomenon 'PM10' from the last 48h
m = osem_measurements('PM10')
@ -97,6 +115,7 @@ a bounding box spanning the whole world.
# construct a bounding box 12km around berlin using the sf package,
# and get measurements from stations within that box
library(sf)
library(units)
bbox2 = st_point(c(13.4034, 52.5120)) \%>\%
st_sfc(crs = 4326) \%>\%
st_transform(3857) \%>\% # allow setting a buffer in meters

@ -7,8 +7,14 @@
\usage{
osem_measurements_archive(x, ...)
\method{osem_measurements_archive}{sensebox}(x, fromDate,
toDate = fromDate, sensorFilter = ~T, ..., progress = T)
\method{osem_measurements_archive}{sensebox}(
x,
fromDate,
toDate = fromDate,
sensorFilter = ~T,
...,
progress = T
)
}
\arguments{
\item{x}{A `sensebox data.frame` of a single box, as retrieved via \code{\link{osem_box}},
@ -30,7 +36,7 @@ A \code{tbl_df} containing observations of all selected sensors for each time st
\description{
This function is significantly faster than \code{\link{osem_measurements}} for large
time-frames, as daily CSV dumps for each sensor from
\href{http://archive.opensensemap.org}{archive.opensensemap.org} are used.
\href{https://archive.opensensemap.org}{archive.opensensemap.org} are used.
Note that the latest data available is from the previous day.
}
\details{
@ -42,15 +48,15 @@ but continue the remaining download.
}
\section{Methods (by class)}{
\itemize{
\item \code{sensebox}: Get daywise measurements for one or more sensors of a single box.
}}
\item \code{osem_measurements_archive(sensebox)}: Get daywise measurements for one or more sensors of a single box.
}}
\examples{
# fetch measurements for a single day
box = osem_box('593bcd656ccf3b0011791f5a')
m = osem_measurements_archive(box, as.POSIXlt('2018-09-13'))
\donttest{
# fetch measurements for a single day
box = osem_box('593bcd656ccf3b0011791f5a')
m = osem_measurements_archive(box, as.POSIXlt('2018-09-13'))
# fetch measurements for a date range and selected sensors
sensors = ~ phenomenon \%in\% c('Temperatur', 'Beleuchtungsstärke')
m = osem_measurements_archive(

@ -21,10 +21,10 @@ Get the counts of sensors for each observed phenomenon.
}
\section{Methods (by class)}{
\itemize{
\item \code{sensebox}: Get counts of sensors observing each phenomenon
\item \code{osem_phenomena(sensebox)}: Get counts of sensors observing each phenomenon
from a set of senseBoxes.
}}
}}
\examples{
# get the phenomena for a single senseBox
osem_phenomena(osem_box('593bcd656ccf3b0011791f5a'))

@ -50,7 +50,6 @@ test_that('optional box attributes are correctly parsed', {
expect_null(oldbox$description)
expect_null(oldbox$grouptag)
expect_null(oldbox$weblink)
expect_null(oldbox$updatedAt)
expect_null(oldbox$height)
expect_null(oldbox$lastMeasurement)
})

@ -8,7 +8,7 @@ test_that('a list of all boxes can be retrieved and returns a sensebox data.fram
expect_true(is.data.frame(boxes))
expect_true(is.factor(boxes$model))
expect_true(is.character(boxes$name))
expect_length(names(boxes), 15)
expect_length(names(boxes), 18)
expect_true(any('sensebox' %in% class(boxes)))
})
@ -53,10 +53,17 @@ test_that('a list of boxes with grouptype returns only boxes of that group', {
expect_true(all(boxes$grouptag == 'codeformuenster'))
})
test_that('a list of boxes within a bbox only returns boxes within that bbox', {
check_api()
boxes = osem_boxes(bbox = c(7.8, 51.8, 8.0, 52.0))
expect_true(all(boxes$lon > 7.8 & boxes$lon < 8.0 & boxes$lat > 51.8 & boxes$lat < 52.0))
})
test_that('endpoint can be (mis)configured', {
check_api()
expect_error(osem_boxes(endpoint = 'http://not.the.opensensemap.org'), 'resolve host')
expect_error(osem_boxes(endpoint = 'http://not.the.opensensemap.org'), 'The API at http://not.the.opensensemap.org is currently not available.')
})
test_that('a response with no matches returns empty sensebox data.frame', {

@ -10,7 +10,7 @@ test_that('measurements can be retrieved for a phenomenon', {
measurements = osem_measurements('Windgeschwindigkeit')
measurements = osem_measurements(x = 'Windgeschwindigkeit')
expect_true(tibble::is.tibble(measurements))
expect_true(tibble::is_tibble(measurements))
expect_true('osem_measurements' %in% class(measurements))
})
@ -104,7 +104,7 @@ test_that('both from and to are required when requesting measurements, error oth
test_that('phenomenon is required when requesting measurements, error otherwise', {
check_api()
expect_error(osem_measurements(), 'missing, with no default')
expect_error(osem_measurements())
expect_error(osem_measurements(boxes), 'Parameter "phenomenon" is required')
sfc = sf::st_sfc(sf::st_linestring(x = matrix(data = c(7, 8, 50, 51), ncol = 2)), crs = 4326)

@ -68,7 +68,7 @@ ggplot(exposure_counts, aes(x = createdAt, y = count, colour = exposure)) +
Outdoor boxes are growing *fast*!
We can also see the introduction of `mobile` sensor "stations" in 2017. While
mobile boxes are still few, we can expect a quick rise in 2018 once the new
[senseBox MCU with GPS support is released](https://sensebox.de/blog/2018-03-06-senseBox_MCU).
senseBox MCU with GPS support is released.
Let's have a quick summary:
```{r exposure_summary}
@ -93,7 +93,7 @@ inconsistent (`Luftdaten`, `luftdaten.info`, ...)
grouptag_counts = boxes %>%
group_by(grouptag) %>%
# only include grouptags with 8 or more members
filter(length(grouptag) >= 8 && !is.na(grouptag)) %>%
filter(length(grouptag) >= 8 & !is.na(grouptag)) %>%
mutate(count = row_number(createdAt))
# helper for sorting the grouptags by boxcount
@ -163,7 +163,7 @@ ggplot(boxes_by_date, aes(x = as.Date(week), colour = event)) +
We see a sudden rise in early 2017, which lines up with the fast growing grouptag `Luftdaten`.
This was enabled by an integration of openSenseMap.org into the firmware of the
air quality monitoring project [luftdaten.info](https://luftdaten.info).
air quality monitoring project [luftdaten.info](https://sensor.community/de/).
The dips in mid 2017 and early 2018 could possibly be explained by production/delivery issues
of the senseBox hardware, but I have no data on the exact time frames to verify.
@ -192,7 +192,7 @@ spanning a large chunk of openSenseMap's existence.
duration = boxes %>%
group_by(grouptag) %>%
# only include grouptags with 8 or more members
filter(length(grouptag) >= 8 && !is.na(grouptag) && !is.na(updatedAt)) %>%
filter(length(grouptag) >= 8 & !is.na(grouptag) & !is.na(updatedAt)) %>%
mutate(duration = difftime(updatedAt, createdAt, units='days'))
ggplot(duration, aes(x = grouptag, y = duration)) +

@ -0,0 +1,300 @@
---
title: "Visualising the Development of openSenseMap.org in 2022"
author: "Jan Stenkamp"
date: '`r Sys.Date()`'
output:
html_document:
code_folding: hide
df_print: kable
theme: lumen
toc: yes
toc_float: yes
rmarkdown::html_vignette:
df_print: kable
fig_height: 5
fig_width: 7
toc: yes
vignette: >
%\VignetteIndexEntry{Visualising the Development of openSenseMap.org in 2022}
%\VignetteEncoding{UTF-8}
%\VignetteEngine{knitr::rmarkdown}
---
> This vignette serves as an example on data wrangling & visualization with
`opensensmapr`, `dplyr` and `ggplot2`.
```{r setup, results='hide', message=FALSE, warning=FALSE}
# required packages:
library(opensensmapr) # data download
library(dplyr) # data wrangling
library(ggplot2) # plotting
library(lubridate) # date arithmetic
library(zoo) # rollmean()
```
openSenseMap.org has grown quite a bit in the last years; it would be interesting
to see how we got to the current `r osem_counts()$boxes` sensor stations,
split up by various attributes of the boxes.
While `opensensmapr` provides extensive methods of filtering boxes by attributes
on the server, we do the filtering within R to save time and gain flexibility.
So the first step is to retrieve *all the boxes*.
```{r download, results='hide', message=FALSE, warning=FALSE}
# Download the box metadata. To analyse only a specific subset of boxes,
# pass a filter such as grouptag='ifgi' to osem_boxes() here.
# `boxes_all` keeps the untouched download, `boxes` is the working copy.
boxes = boxes_all = osem_boxes()
```
# Introduction
In the following we just want to have a look at the boxes created in 2022, so we filter for them.
```{r}
# Keep only the boxes whose location timestamp lies within 2022.
# The upper bound is exclusive and set to the start of 2023: a bound of
# <= "2022-12-31" ends at the very beginning of that day and would drop
# boxes registered later on December 31st.
# NOTE(review): assumes locationtimestamp is a date-time column - confirm.
boxes = filter(boxes, locationtimestamp >= "2022-01-01" & locationtimestamp < "2023-01-01")

# Summarise the filtered boxes. The result is stored under a neutral name so
# it cannot be confused with the S3 method `summary.data.frame`.
boxes_summary = summary(boxes)
```
<!-- This gives a good overview already: As of writing this, there are more than 11,000 -->
<!-- sensor stations, of which ~30% are currently running. Most of them are placed -->
<!-- outdoors and have around 5 sensors each. -->
<!-- The oldest station is from August 2016, while the latest station was registered a -->
<!-- couple of minutes ago. -->
Another feature of interest is the spatial distribution of the boxes: `plot()`
can help us out here. This function requires a bunch of optional dependencies though.
```{r message=F, warning=F}
# Plotting the boxes needs a few optional packages. Nothing is installed from
# within the vignette (that would silently modify the reader's library);
# the plot is only drawn when all of the packages are already available.
plot_deps = c('maps', 'maptools', 'rgeos')
deps_available = vapply(plot_deps, requireNamespace, logical(1), quietly = TRUE)

if (all(deps_available)) {
  plot(boxes)
} else {
  # cat() instead of message(): this chunk is rendered with messages hidden
  cat(
    'Plot skipped, missing packages:',
    paste(plot_deps[!deps_available], collapse = ', '),
    '\n'
  )
}
```
But what do these sensor stations actually measure? Let's find out.
`osem_phenomena()` gives us a named list of the counts of each observed
phenomenon for the given set of sensor stations:
```{r}
# count the observed phenomena for the selected boxes
phenoms = osem_phenomena(boxes)
# compact overview of the structure of the result
str(phenoms)
```
That's quite some noise there, with many phenomena being measured by a single
sensor only, or many duplicated phenomena due to slightly different spellings.
We should clean that up, but for now let's just filter out the noise and find
those phenomena with high sensor numbers:
```{r}
# keep only the phenomena whose count is greater than 50
phenoms[phenoms > 50]
```
# Plot count of boxes by time {.tabset}
By looking at the `createdAt` attribute of each box we know the exact time a box
was registered. Because of some database migration issues the `createdAt` values are mostly wrong (~80% of boxes created 2022-03-30), so we are using the `timestamp` attribute of the `currentlocation` which should in most cases correspond to the creation date.
With this approach we have no information about boxes that were deleted in the
meantime, but that's okay for now.
## ...and exposure
```{r exposure_counts, message=FALSE}
# line colour used for each exposure type
exposure_colors = c(indoor = 'red', outdoor = 'lightgreen', mobile = 'blue', unknown = 'darkgrey')

# number the boxes of each exposure type by their location timestamp,
# which gives a running box count per exposure
exposure_counts = mutate(
  group_by(boxes, exposure),
  count = row_number(locationtimestamp)
)

ggplot(exposure_counts, aes(x = locationtimestamp, y = count, colour = exposure)) +
  scale_colour_manual(values = exposure_colors) +
  geom_line() +
  labs(x = 'Registration Date', y = 'senseBox count')
```
Outdoor boxes are growing *fast*!
We can also see the introduction of `mobile` sensor "stations" in 2017.
Let's have a quick summary:
```{r exposure_summary}
# per exposure type: earliest and latest location timestamp and the final
# box count, sorted with the largest count first
arrange(
  summarise(
    exposure_counts,
    oldest = min(locationtimestamp),
    newest = max(locationtimestamp),
    count = max(count)
  ),
  desc(count)
)
```
## ...and grouptag
We can try to find out where the increases in growth came from, by analysing the
box count by grouptag.
Caveats: Only a small subset of boxes has a grouptag, and we should assume
that these groups are actually bigger. Also, we can see that grouptag naming is
inconsistent (`Luftdaten`, `luftdaten.info`, ...)
```{r grouptag_counts, message=FALSE}
# running box count per grouptag; grouptags with fewer than 15 boxes and
# missing or empty grouptags are left out
grouptag_counts = mutate(
  filter(
    group_by(boxes, grouptag),
    n() >= 15 & !is.na(grouptag) & grouptag != ''
  ),
  count = row_number(locationtimestamp)
)
# Re-level a factor (or vector) so that its levels are ordered by how often
# each value occurs.
#   oldFactor: the values to re-level
#   ascending: TRUE puts the rarest value first, FALSE the most frequent one
# Returns a factor with the same values as `oldFactor` and re-ordered levels.
sortLvls = function(oldFactor, ascending = TRUE) {
  value_counts = table(oldFactor)
  ordered_counts = sort(value_counts, decreasing = !ascending)
  factor(oldFactor, levels = names(ordered_counts))
}
# put the grouptag levels in order of descending box count before plotting
grouptag_counts$grouptag = sortLvls(grouptag_counts$grouptag, ascending = FALSE)

ggplot(
  grouptag_counts,
  aes(x = locationtimestamp, y = count, colour = grouptag)
) +
  geom_line(aes(group = grouptag)) +
  labs(x = 'Registration Date', y = 'senseBox count')
```
```{r grouptag_summary}
# per grouptag: earliest and latest location timestamp and the final box
# count, sorted with the largest count first
arrange(
  summarise(
    grouptag_counts,
    oldest = min(locationtimestamp),
    newest = max(locationtimestamp),
    count = max(count)
  ),
  desc(count)
)
```
# Plot rate of growth and inactivity per week
First we group the boxes by `locationtimestamp` into bins of one week:
```{r growthrate_registered, warning=FALSE, message=FALSE, results='hide'}
# width of the time bins, and the window (in bins) of the moving average
# that is drawn in the growth-rate plot
bins = 'week'
mvavg_bins = 6

# number of boxes per bin, based on the location timestamp
growth = boxes %>%
  mutate(week = cut(as.Date(locationtimestamp), breaks = bins)) %>%
  group_by(week) %>%
  summarize(count = n()) %>%
  mutate(event = 'registered')
```
We can do the same for `updatedAt`, which informs us about the last change to
a box, including uploaded measurements. As a lot of boxes were "updated" by the database
migration, many of them are updated at 2022-03-30, so we try to use the `lastMeasurement`
attribute instead of `updatedAt`. This leads to fewer boxes but also automatically excludes
boxes which were created but never made a measurement.
This method of determining inactive boxes is fairly inaccurate and should be
considered an approximation, because we have no information about intermediate
inactive phases.
Also deleted boxes would probably have a big impact here.
```{r growthrate_inactive, warning=FALSE, message=FALSE, results='hide'}
inactive = boxes %>%
  # drop boxes whose last measurement is less than two days old: every box
  # has a last measurement at some point, so very recent ones are not
  # treated as inactive
  filter(lastMeasurement < now() - days(2)) %>%
  # bin the boxes by the date of their last measurement
  mutate(week = cut(as.Date(lastMeasurement), breaks = bins)) %>%
  # only keep bins that start after the end of 2021
  filter(as.Date(week) > as.Date("2021-12-31")) %>%
  group_by(week) %>%
  summarize(count = n()) %>%
  mutate(event = 'inactive')
```
Now we can combine both datasets for plotting:
```{r growthrate, warning=FALSE, message=FALSE, results='hide'}
# stack the weekly 'registered' and 'inactive' counts, grouped by event
boxes_by_date = bind_rows(growth, inactive) %>% group_by(event)

# ggplot2 evaluates aesthetics on the whole data frame and ignores dplyr
# groups, so calling rollmean() inside aes() would average across the boundary
# between the 'registered' and the 'inactive' rows. The moving average is
# therefore computed per event beforehand, in a separate data frame so that
# `boxes_by_date` itself keeps its columns.
# fill = list(NA, NULL, NA) makes the first and last values NA so the result
# has the same length as the input.
boxes_by_date_avg = boxes_by_date %>%
  mutate(count_avg = rollmean(count, mvavg_bins, fill = list(NA, NULL, NA)))

ggplot(boxes_by_date_avg, aes(x = as.Date(week), colour = event)) +
  xlab('Time') + ylab(paste('rate per ', bins)) +
  scale_x_date(date_breaks="years", date_labels="%Y") +
  scale_colour_manual(values = c(registered = 'lightgreen', inactive = 'grey')) +
  geom_point(aes(y = count), size = 0.5) +
  # moving average per event
  geom_line(aes(y = count_avg))
```
And see in which weeks the most boxes become (in)active:
```{r table_mostregistrations}
# bins with more than 50 events, highest count first
arrange(
  filter(boxes_by_date, count > 50),
  desc(count)
)
```
# Plot duration of boxes being active {.tabset}
While we are looking at `locationtimestamp` and `lastMeasurement`, we can also extract the duration of activity
of each box, and look at metrics by exposure and grouptag once more:
## ...by exposure
```{r exposure_duration, message=FALSE}
# time between the location timestamp and the last measurement of each box,
# grouped by exposure; boxes without a last measurement and negative
# durations are dropped
durations = boxes %>%
  filter(!is.na(lastMeasurement)) %>%
  group_by(exposure) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units = 'days')) %>%
  filter(duration >= 0)

ggplot(durations, aes(x = exposure, y = duration)) +
  geom_boxplot() +
  coord_flip() +
  labs(y = 'Duration active in Days')
```
The time of activity averages at only `r round(mean(durations$duration))` days,
though there are boxes with `r round(max(durations$duration))` days of activity,
spanning a large chunk of openSenseMap's existence.
## ...by grouptag
```{r grouptag_duration, message=FALSE}
# time between the location timestamp and the last measurement of each box,
# grouped by grouptag
durations = boxes %>%
  # boxes without a last measurement have no duration to compute
  filter(!is.na(lastMeasurement)) %>%
  group_by(grouptag) %>%
  # only include grouptags with 15 or more members; missing and empty
  # grouptags are skipped, consistent with the grouptag_counts chunk
  filter(length(grouptag) >= 15 & !is.na(grouptag) & grouptag != '') %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units='days')) %>%
  # negative durations (last measurement before the location timestamp) are dropped
  filter(duration >= 0)

ggplot(durations, aes(x = grouptag, y = duration)) +
  geom_boxplot() +
  coord_flip() + ylab('Duration active in Days')

# per grouptag: average, shortest and longest duration, plus the time since
# the earliest location timestamp; sorted by average duration
durations %>%
  summarize(
    duration_avg = round(mean(duration)),
    duration_min = round(min(duration)),
    duration_max = round(max(duration)),
    oldest_box = round(max(difftime(now(), locationtimestamp, units='days')))
  ) %>%
  arrange(desc(duration_avg))
```
The time of activity averages at only `r round(mean(durations$duration))` days,
though there are boxes with `r round(max(durations$duration))` days of activity,
spanning a large chunk of openSenseMap's existence.
## ...by year of registration
This is less useful, as older boxes are active for a longer time by definition.
If you have an idea how to compensate for that, please send a [Pull Request][PR]!
```{r year_duration, message=FALSE}
# Duration between location timestamp and last measurement, grouped by the
# year of the location timestamp.
# NOTE(review): the previous note here said boxes older than 2016 are missing
# because of missing updatedAt values; this chunk uses lastMeasurement, so
# confirm whether that still applies.
duration = boxes %>%
  mutate(year = cut(as.Date(locationtimestamp), breaks = 'year')) %>%
  filter(!is.na(lastMeasurement)) %>%
  mutate(duration = difftime(lastMeasurement, locationtimestamp, units = 'days')) %>%
  filter(duration >= 0) %>%
  group_by(year)

# the bin label is a date string; its first four characters are the year
ggplot(duration, aes(x = substr(as.character(year), 1, 4), y = duration)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = 'Year of Registration', y = 'Duration active in Days')
```
# More Visualisations
Other visualisations come to mind, and are left as an exercise to the reader.
If you implemented some, feel free to add them to this vignette via a [Pull Request][PR].
* growth by phenomenon
* growth by location -> (interactive) map
* set inactive rate in relation to total box count
* filter timespans with big dips in growth rate, and extrapolate the amount of
senseBoxes that could be on the platform today, assuming there were no production issues ;)
[PR]: https://github.com/sensebox/opensensmapr/pulls
Loading…
Cancel
Save