From 8b4ec6295db2b92c0a91db916020a0d218edf9c6 Mon Sep 17 00:00:00 2001 From: noerw Date: Thu, 24 Aug 2017 21:45:38 +0200 Subject: [PATCH] update vignette build --- README.md | 8 ++-- inst/doc/osem-intro.R | 17 ++++++-- inst/doc/osem-intro.Rmd | 40 +++++++++++------- inst/doc/osem-intro.html | 89 +++++++++++++++++++++------------------- 4 files changed, 89 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 799fae5..3aec00e 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ This R package ingests data (environmental measurements, sensor stations) from the API of opensensemap.org for analysis in R. The package aims to be compatible with sf and the tidyverse. -> **Whats up with that package name?** idk, the R people seem to [enjoy][1] +> *Whats up with that package name?* idk, the R people seem to [enjoy][1] [dropping][2] [vovels][3] so.. Unfortunately I couldn't fit the naming convention to drop an `y` in there. @@ -21,7 +21,7 @@ devtools::install_github('noerw/opensensmapr') ``` ## Usage -A usage example is shown in the vignette [`osem-intro`](vignettes/osem-intro.Rmd). +A usage example is shown in the vignette [`osem-intro`](https://noerw.github.com/opensensmapR/inst/doc/osem-intro.html). In general these are the main functions for data retrieval: ```r @@ -43,9 +43,9 @@ m = osem_measurements(bbox, phenomenon, filter1, ...) osem_counts() ``` -Additionally there are some helpers: `summary.sensebox(), plot.sensebox(), osem_as_sf()...`. +Additionally there are some helpers: `summary.sensebox(), plot.sensebox(), st_as_sf.sensebox(), [.sensebox(), filter.sensebox(), mutate.sensebox(), ...`. -For parameter options, open each functions' documentation by calling `?`. +For parameter usage, open each functions' documentation by calling `?`. 
## License GPL-2.0 - Norwin Roosen diff --git a/inst/doc/osem-intro.R b/inst/doc/osem-intro.R index e51d194..575f473 100644 --- a/inst/doc/osem-intro.R +++ b/inst/doc/osem-intro.R @@ -39,12 +39,13 @@ plot(pm25_sensors) library(sf) library(units) library(lubridate) +library(dplyr) # construct a bounding box: 12 kilometers around Berlin berlin = st_point(c(13.4034, 52.5120)) %>% st_sfc(crs = 4326) %>% st_transform(3857) %>% # allow setting a buffer in meters - st_buffer(units::set_units(12, km)) %>% + st_buffer(set_units(12, km)) %>% st_transform(4326) %>% # the opensensemap expects WGS 84 st_bbox() @@ -52,13 +53,21 @@ berlin = st_point(c(13.4034, 52.5120)) %>% pm25 = osem_measurements( berlin, phenomenon = 'PM2.5', - from = now() - days(7), # defaults to 2 days + from = now() - days(20), # defaults to 2 days to = now() ) plot(pm25) ## ------------------------------------------------------------------------ -pm25_sf = osem_as_sf(pm25) -plot(st_geometry(pm25_sf), axes = T) +outliers = filter(pm25, value > 100)$sensorId +bad_sensors = outliers[, drop = T] %>% levels() + +pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors) + +## ------------------------------------------------------------------------ +st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T) + +## ------------------------------------------------------------------------ +pm25 %>% filter(invalid == FALSE) %>% plot() diff --git a/inst/doc/osem-intro.Rmd b/inst/doc/osem-intro.Rmd index 5d6126a..9906138 100644 --- a/inst/doc/osem-intro.Rmd +++ b/inst/doc/osem-intro.Rmd @@ -26,11 +26,6 @@ Its main goals are to provide means for: - big data analysis of the measurements stored on the platform - sensor metadata analysis (sensor counts, spatial distribution, temporal trends) -> *Please note:* The openSenseMap API is sometimes a bit unstable when streaming -long responses, which results in `curl` complaining about `Unexpected EOF`. This -bug is being worked on upstream. 
Meanwhile you have to retry the request when -this occurs. - ### Exploring the dataset Before we look at actual observations, lets get a grasp of the openSenseMap datasets' structure. @@ -45,14 +40,14 @@ all_sensors = osem_boxes() summary(all_sensors) ``` -This gives a good overview already: As of writing this, there are more than 600 +This gives a good overview already: As of writing this, there are more than 700 sensor stations, of which ~50% are currently running. Most of them are placed outdoors and have around 5 sensors each. The oldest station is from May 2014, while the latest station was registered a couple of minutes ago. -Another feature of interest is the spatial distribution of the boxes. `plot()` -can help us out here. This function requires a bunch of optional dependcies though. +Another feature of interest is the spatial distribution of the boxes: `plot()` +can help us out here. This function requires a bunch of optional dependencies though. ```{r message=F, warning=F} if (!require('maps')) install.packages('maps') @@ -112,12 +107,13 @@ Luckily we can get the measurements filtered by a bounding box: library(sf) library(units) library(lubridate) +library(dplyr) # construct a bounding box: 12 kilometers around Berlin berlin = st_point(c(13.4034, 52.5120)) %>% st_sfc(crs = 4326) %>% st_transform(3857) %>% # allow setting a buffer in meters - st_buffer(units::set_units(12, km)) %>% + st_buffer(set_units(12, km)) %>% st_transform(4326) %>% # the opensensemap expects WGS 84 st_bbox() ``` @@ -125,19 +121,33 @@ berlin = st_point(c(13.4034, 52.5120)) %>% pm25 = osem_measurements( berlin, phenomenon = 'PM2.5', - from = now() - days(7), # defaults to 2 days + from = now() - days(20), # defaults to 2 days to = now() ) plot(pm25) ``` -Now we can get started with actual spatiotemporal data analysis. First plot the -measuring locations: +Now we can get started with actual spatiotemporal data analysis. 
+First, lets mask the seemingly uncalibrated sensors: + +```{r} +outliers = filter(pm25, value > 100)$sensorId +bad_sensors = outliers[, drop = T] %>% levels() + +pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors) +``` + +Then plot the measuring locations, flagging the outliers: + +```{r} +st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T) +``` + +Removing these sensors yields a nicer time series plot: ```{r} -pm25_sf = osem_as_sf(pm25) -plot(st_geometry(pm25_sf), axes = T) +pm25 %>% filter(invalid == FALSE) %>% plot() ``` -further analysis: `TODO` +Further analysis: comparison with LANUV data `TODO` diff --git a/inst/doc/osem-intro.html b/inst/doc/osem-intro.html index f0f4286..31bf7e2 100644 --- a/inst/doc/osem-intro.html +++ b/inst/doc/osem-intro.html @@ -12,7 +12,7 @@ - + Analyzing environmental sensor data from openSenseMap.org in R @@ -70,7 +70,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf

Analyzing environmental sensor data from openSenseMap.org in R

Norwin Roosen

-

2017-08-23

+

2017-08-24

@@ -81,9 +81,6 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
  • big data analysis of the measurements stored on the platform
  • sensor metadata analysis (sensor counts, spatial distribution, temporal trends)
  • -
    -

    Please note: The openSenseMap API is sometimes a bit unstable when streaming long responses, which results in curl complaining about Unexpected EOF. This bug is being worked on upstream. Meanwhile you have to retry the request when this occurs.

    -

    Exploring the dataset

    Before we look at actual observations, let’s get a grasp of the openSenseMap datasets’ structure.

    @@ -92,11 +89,11 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf all_sensors = osem_boxes()
    summary(all_sensors)
    -
    ## box total: 701
    +
    ## boxes total: 704
     ## 
     ## boxes by exposure:
     ##  indoor outdoor unknown 
    -##     127     553      21 
    +##     127     556      21 
     ## 
     ## boxes by model:
     ##                  custom            homeEthernet   homeEthernetFeinstaub 
    @@ -104,38 +101,38 @@ all_sensors = osem_boxes()
    -

    This gives a good overview already: As of writing this, there are more than 600 sensor stations, of which ~50% are currently running. Most of them are placed outdoors and have around 5 sensors each. The oldest station is from May 2014, while the latest station was registered a couple of minutes ago.

    -

    Another feature of interest is the spatial distribution of the boxes. plot() can help us out here. This function requires a bunch of optional dependcies though.

    +## 1.000 4.000 5.000 4.605 5.000 17.000
    +

    This gives a good overview already: As of writing this, there are more than 700 sensor stations, of which ~50% are currently running. Most of them are placed outdoors and have around 5 sensors each. The oldest station is from May 2014, while the latest station was registered a couple of minutes ago.

    +

    Another feature of interest is the spatial distribution of the boxes: plot() can help us out here. This function requires a bunch of optional dependencies though.

    if (!require('maps'))     install.packages('maps')
     if (!require('maptools')) install.packages('maptools')
     if (!require('rgeos'))    install.packages('rgeos')
     
     plot(all_sensors)
    -

    +

    It seems we have to reduce our area of interest to Germany.

    But what do these sensor stations actually measure? Let’s find out. osem_phenomena() gives us a named list of the counts of each observed phenomenon for the given set of sensor stations:

    phenoms = osem_phenomena(all_sensors)
     str(phenoms)
    -
    ## List of 191
    -##  $ Temperatur                              : int 644
    -##  $ rel. Luftfeuchte                        : int 531
    -##  $ Luftdruck                               : int 367
    -##  $ PM10                                    : int 344
    -##  $ PM2.5                                   : int 344
    +
    ## List of 189
    +##  $ Temperatur                              : int 647
    +##  $ rel. Luftfeuchte                        : int 534
    +##  $ Luftdruck                               : int 368
    +##  $ PM10                                    : int 347
    +##  $ PM2.5                                   : int 347
     ##  $ UV-Intensität                           : int 255
     ##  $ Beleuchtungsstärke                      : int 251
     ##  $ Luftfeuchtigkeit                        : int 83
    @@ -234,19 +231,19 @@ if (!require('rgeos'))    That’s quite some noise there, with many phenomena being measured by a single sensor only, or many duplicated phenomena due to slightly different spellings. We should clean that up, but for now let’s just filter out the noise and find those phenomena with high sensor numbers:

    phenoms[phenoms > 20]
    ## $Temperatur
    -## [1] 644
    +## [1] 647
     ## 
     ## $`rel. Luftfeuchte`
    -## [1] 531
    +## [1] 534
     ## 
     ## $Luftdruck
    -## [1] 367
    +## [1] 368
     ## 
     ## $PM10
    -## [1] 344
    +## [1] 347
     ## 
     ## $PM2.5
    -## [1] 344
    +## [1] 347
     ## 
     ## $`UV-Intensität`
     ## [1] 255
    @@ -266,32 +263,32 @@ if (!require('rgeos'))    phenomenon = 'PM2.5'
     )
    summary(pm25_sensors)
    -
    ## box total: 236
    +
    ## boxes total: 240
     ## 
     ## boxes by exposure:
     ## outdoor 
    -##     236 
    +##     240 
     ## 
     ## boxes by model:
     ##                  custom   homeEthernetFeinstaub                homeWifi 
     ##                      18                       4                       5 
     ##       homeWifiFeinstaub        luftdaten_sds011 luftdaten_sds011_bme280 
    -##                      12                      15                      29 
    +##                      12                      14                      29 
     ## luftdaten_sds011_bmp180  luftdaten_sds011_dht11  luftdaten_sds011_dht22 
    -##                       1                      11                     141 
    +##                       1                      11                     146 
     ## 
     ## $last_measurement_within
     ##    1h    1d   30d  365d never 
    -##   230   233   234   234     2 
    +##     0     0     0     0   240 
     ## 
     ## oldest box: 2016-09-11 08:17:17 (Balkon Gasselstiege)
    -## newest box: 2017-08-23 08:44:14 (Messstation Steinheim am Albuch)
    +## newest box: 2017-08-24 17:38:44 (Burgweinting)
     ## 
     ## sensors per box:
     ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    -##   2.000   4.000   4.000   4.271   4.000  10.000
    +## 2.000 4.000 4.000 4.275 4.000 10.000
    plot(pm25_sensors)
    -

    +

    That’s still more than 200 measuring stations; we can work with that.

    @@ -300,28 +297,36 @@ if (!require('rgeos'))
    library(sf)
     library(units)
     library(lubridate)
    +library(dplyr)
     
     # construct a bounding box: 12 kilometers around Berlin
     berlin = st_point(c(13.4034, 52.5120)) %>%
       st_sfc(crs = 4326) %>%
       st_transform(3857) %>% # allow setting a buffer in meters
    -  st_buffer(units::set_units(12, km)) %>%
    +  st_buffer(set_units(12, km)) %>%
       st_transform(4326) %>% # the opensensemap expects WGS 84
       st_bbox()
    pm25 = osem_measurements(
       berlin,
       phenomenon = 'PM2.5',
    -  from = now() - days(7), # defaults to 2 days
    +  from = now() - days(20), # defaults to 2 days
       to = now()
     )
     
     plot(pm25)
    -

    -

    Now we can get started with actual spatiotemporal data analysis. First plot the measuring locations:

    -
    pm25_sf = osem_as_sf(pm25)
    -plot(st_geometry(pm25_sf), axes = T)
    -

    -

    further analysis: TODO

    +

    +

    Now we can get started with actual spatiotemporal data analysis. First, let’s mask the seemingly uncalibrated sensors:

    +
    outliers = filter(pm25, value > 100)$sensorId
    +bad_sensors = outliers[, drop = T] %>% levels()
    +
    +pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
    +

    Then plot the measuring locations, flagging the outliers:

    +
    st_as_sf(pm25) %>% st_geometry() %>% plot(col = factor(pm25$invalid), axes = T)
    +

    +

    Removing these sensors yields a nicer time series plot:

    +
    pm25 %>% filter(invalid == FALSE) %>% plot()
    +

    +

    Further analysis: comparison with LANUV data TODO