You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
opensensmapR/inst/doc/osem-intro.html

588 lines
403 KiB
HTML

<!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>Analyzing environmental sensor data from openSenseMap.org in R</title>
<script type="text/javascript">
// After the page has finished loading, center every image that is the only
// element child of its parent.
// The handler is registered with addEventListener instead of being assigned
// to window.onload, so it neither overwrites nor can be overwritten by any
// other load handler on the page.
window.addEventListener('load', function () {
  var images = document.getElementsByTagName('img');
  for (var i = 0; i < images.length; i++) {
    var parent = images[i].parentElement;
    // childElementCount counts element children only, so text nodes
    // (e.g. whitespace around the image) do not prevent centering.
    if (parent && parent.childElementCount === 1) {
      parent.style.textAlign = 'center';
    }
  }
});
</script>
<!-- Styles for R syntax highlighter -->
<style type="text/css">
/* Token colours applied to highlighted code inside <pre> blocks. */

/* Operators and parentheses */
pre .operator, pre .paren { color: #687687; }

/* Literal values */
pre .literal { color: #990073; }

/* Numeric constants */
pre .number { color: #099; }

/* Comments are additionally set in italics */
pre .comment {
  font-style: italic;
  color: #998;
}

/* Keywords are additionally set in bold */
pre .keyword {
  font-weight: bold;
  color: #900;
}

/* Identifiers are plain black */
pre .identifier { color: #000; }

/* String literals */
pre .string { color: #d14; }
</style>
<!-- R syntax highlighter -->
<script type="text/javascript">
var hljs=new function(){function m(p){return p.replace(/&/gm,"&amp;").replace(/</gm,"&lt;")}function f(r,q,p){return RegExp(q,"m"+(r.cI?"i":"")+(p?"g":""))}function b(r){for(var p=0;p<r.childNodes.length;p++){var q=r.childNodes[p];if(q.nodeName=="CODE"){return q}if(!(q.nodeType==3&&q.nodeValue.match(/\s+/))){break}}}function h(t,s){var p="";for(var r=0;r<t.childNodes.length;r++){if(t.childNodes[r].nodeType==3){var q=t.childNodes[r].nodeValue;if(s){q=q.replace(/\n/g,"")}p+=q}else{if(t.childNodes[r].nodeName=="BR"){p+="\n"}else{p+=h(t.childNodes[r])}}}if(/MSIE [678]/.test(navigator.userAgent)){p=p.replace(/\r/g,"\n")}return p}function a(s){var r=s.className.split(/\s+/);r=r.concat(s.parentNode.className.split(/\s+/));for(var q=0;q<r.length;q++){var p=r[q].replace(/^language-/,"");if(e[p]){return p}}}function c(q){var p=[];(function(s,t){for(var r=0;r<s.childNodes.length;r++){if(s.childNodes[r].nodeType==3){t+=s.childNodes[r].nodeValue.length}else{if(s.childNodes[r].nodeName=="BR"){t+=1}else{if(s.childNodes[r].nodeType==1){p.push({event:"start",offset:t,node:s.childNodes[r]});t=arguments.callee(s.childNodes[r],t);p.push({event:"stop",offset:t,node:s.childNodes[r]})}}}}return t})(q,0);return p}function k(y,w,x){var q=0;var z="";var s=[];function u(){if(y.length&&w.length){if(y[0].offset!=w[0].offset){return(y[0].offset<w[0].offset)?y:w}else{return w[0].event=="start"?y:w}}else{return y.length?y:w}}function t(D){var A="<"+D.nodeName.toLowerCase();for(var B=0;B<D.attributes.length;B++){var C=D.attributes[B];A+=" "+C.nodeName.toLowerCase();if(C.value!==undefined&&C.value!==false&&C.value!==null){A+='="'+m(C.value)+'"'}}return A+">"}while(y.length||w.length){var v=u().splice(0,1)[0];z+=m(x.substr(q,v.offset-q));q=v.offset;if(v.event=="start"){z+=t(v.node);s.push(v.node)}else{if(v.event=="stop"){var p,r=s.length;do{r--;p=s[r];z+=("</"+p.nodeName.toLowerCase()+">")}while(p!=v.node);s.splice(r,1);while(r<s.length){z+=t(s[r]);r++}}}}return z+m(x.substr(q))}function 
j(){function q(x,y,v){if(x.compiled){return}var u;var s=[];if(x.k){x.lR=f(y,x.l||hljs.IR,true);for(var w in x.k){if(!x.k.hasOwnProperty(w)){continue}if(x.k[w] instanceof Object){u=x.k[w]}else{u=x.k;w="keyword"}for(var r in u){if(!u.hasOwnProperty(r)){continue}x.k[r]=[w,u[r]];s.push(r)}}}if(!v){if(x.bWK){x.b="\\b("+s.join("|")+")\\s"}x.bR=f(y,x.b?x.b:"\\B|\\b");if(!x.e&&!x.eW){x.e="\\B|\\b"}if(x.e){x.eR=f(y,x.e)}}if(x.i){x.iR=f(y,x.i)}if(x.r===undefined){x.r=1}if(!x.c){x.c=[]}x.compiled=true;for(var t=0;t<x.c.length;t++){if(x.c[t]=="self"){x.c[t]=x}q(x.c[t],y,false)}if(x.starts){q(x.starts,y,false)}}for(var p in e){if(!e.hasOwnProperty(p)){continue}q(e[p].dM,e[p],true)}}function d(B,C){if(!j.called){j();j.called=true}function q(r,M){for(var L=0;L<M.c.length;L++){if((M.c[L].bR.exec(r)||[null])[0]==r){return M.c[L]}}}function v(L,r){if(D[L].e&&D[L].eR.test(r)){return 1}if(D[L].eW){var M=v(L-1,r);return M?M+1:0}return 0}function w(r,L){return L.i&&L.iR.test(r)}function K(N,O){var M=[];for(var L=0;L<N.c.length;L++){M.push(N.c[L].b)}var r=D.length-1;do{if(D[r].e){M.push(D[r].e)}r--}while(D[r+1].eW);if(N.i){M.push(N.i)}return f(O,M.join("|"),true)}function p(M,L){var N=D[D.length-1];if(!N.t){N.t=K(N,E)}N.t.lastIndex=L;var r=N.t.exec(M);return r?[M.substr(L,r.index-L),r[0],false]:[M.substr(L),"",true]}function z(N,r){var L=E.cI?r[0].toLowerCase():r[0];var M=N.k[L];if(M&&M instanceof Array){return M}return false}function F(L,P){L=m(L);if(!P.k){return L}var r="";var O=0;P.lR.lastIndex=0;var M=P.lR.exec(L);while(M){r+=L.substr(O,M.index-O);var N=z(P,M);if(N){x+=N[1];r+='<span class="'+N[0]+'">'+M[0]+"</span>"}else{r+=M[0]}O=P.lR.lastIndex;M=P.lR.exec(L)}return r+L.substr(O,L.length-O)}function J(L,M){if(M.sL&&e[M.sL]){var r=d(M.sL,L);x+=r.keyword_count;return r.value}else{return F(L,M)}}function I(M,r){var L=M.cN?'<span class="'+M.cN+'">':"";if(M.rB){y+=L;M.buffer=""}else{if(M.eB){y+=m(r)+L;M.buffer=""}else{y+=L;M.buffer=r}}D.push(M);A+=M.r}function G(N,M,Q){var 
R=D[D.length-1];if(Q){y+=J(R.buffer+N,R);return false}var P=q(M,R);if(P){y+=J(R.buffer+N,R);I(P,M);return P.rB}var L=v(D.
hljs.initHighlightingOnLoad();
</script>
<style type="text/css">
/* ---- Base typography and layout ---- */

/* Text defaults shared by the page and by table cells. */
body, td {
  background-color: white;
  font-family: sans-serif;
  font-size: 13px;
}

/* Content column: at most 800px wide, horizontally centered. */
body {
  line-height: 20px;
  margin: auto;
  max-width: 800px;
  padding: 1em;
}

/* Monospace stack for inline code and code blocks. */
tt, code, pre {
  font-family: 'DejaVu Sans Mono', 'Droid Sans Mono', 'Lucida Console', Consolas, Monaco, monospace;
}

/* Heading scale, largest to smallest. */
h1 { font-size: 2.2em; }
h2 { font-size: 1.8em; }
h3 { font-size: 1.4em; }
h4 { font-size: 1em; }
h5 { font-size: 0.9em; }
h6 { font-size: 0.8em; }

a:visited { color: rgb(50%, 0%, 50%); }

/* Code blocks and images never exceed the width of their container;
   over-wide code scrolls horizontally instead. */
pre, img { max-width: 100%; }
pre { overflow-x: auto; }

pre code {
  padding: 0.5em;
  display: block;
}

/* All code gets a thin border ... */
code {
  border: 1px solid #ccc;
  font-size: 92%;
}

/* ... and code carrying a class attribute also gets a light background. */
code[class] { background-color: #f8f8f8; }

table, td, th { border: none; }

/* Quotes: grey text with a thick bar on the left. */
blockquote {
  border-left: 0.5em solid #eee;
  padding-left: 1em;
  margin: 0;
  color: #666;
}

/* Horizontal rule drawn as a thin dotted top border only. */
hr {
  border-top: thin dotted #999;
  border-bottom: none;
  height: 0;
}
/* ---- Print stylesheet ---- */
@media print {
  /* Strip backgrounds and filters; print all text in black. */
  * {
    color: black !important;
    background: transparent !important;
    -ms-filter: none !important;
    filter: none !important;
  }

  /* Use the full page width at a print-appropriate font size. */
  body {
    max-width: 100%;
    font-size: 12pt;
  }

  a, a:visited { text-decoration: underline; }

  /* A horizontal rule becomes an invisible forced page break. */
  hr {
    page-break-before: always;
    visibility: hidden;
  }

  /* Avoid splitting code blocks, quotes, table rows and images across pages. */
  pre, blockquote {
    page-break-inside: avoid;
    padding-right: 1em;
  }
  tr, img { page-break-inside: avoid; }

  img { max-width: 100% !important; }

  /* Page margins (top right bottom left): left-hand pages get the wider
     20mm margin on the right, right-hand pages get it on the left. */
  @page :left { margin: 15mm 20mm 15mm 10mm; }
  @page :right { margin: 15mm 10mm 15mm 20mm; }

  /* Keep at least three lines together at page breaks. */
  p, h2, h3 {
    widows: 3;
    orphans: 3;
  }

  /* Do not leave a heading stranded at the bottom of a page. */
  h2, h3 { page-break-after: avoid; }
}
</style>
</head>
<body>
<h2>Analyzing environmental sensor data from openSenseMap.org in R</h2>
<p>This package provides data ingestion functions for almost any data stored on the
open data platform for environmental sensor data <a href="https://opensensemap.org">https://opensensemap.org</a>.
Its main goals are to provide means for:</p>
<ul>
<li>big data analysis of the measurements stored on the platform</li>
<li>sensor metadata analysis (sensor counts, spatial distribution, temporal trends)</li>
</ul>
<h3>Exploring the dataset</h3>
<p>Before we look at actual observations, let&#39;s get a grasp of the openSenseMap
datasets&#39; structure.</p>
<pre><code class="r">library(magrittr)
library(opensensmapr)
all_sensors = osem_boxes()
</code></pre>
<pre><code class="r">summary(all_sensors)
</code></pre>
<pre><code>## boxes total: 1779
##
## boxes by exposure:
## indoor mobile outdoor unknown
## 288 55 1416 20
##
## boxes by model:
## custom homeEthernet homeEthernetFeinstaub
## 335 92 49
## homeWifi homeWifiFeinstaub luftdaten_pms1003
## 192 144 1
## luftdaten_pms1003_bme280 luftdaten_pms5003_bme280 luftdaten_pms7003_bme280
## 1 5 2
## luftdaten_sds011 luftdaten_sds011_bme280 luftdaten_sds011_bmp180
## 57 197 19
## luftdaten_sds011_dht11 luftdaten_sds011_dht22
## 46 639
##
## $last_measurement_within
## 1h 1d 30d 365d never
## 921 960 1089 1427 235
##
## oldest box: 2014-05-28 15:36:14 (CALIMERO)
## newest box: 2018-05-24 20:29:50 (Stadthalle)
##
## sensors per box:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.000 4.601 5.000 33.000
</code></pre>
<p>This gives a good overview already: As of writing this, there are more than 1700
sensor stations, of which ~50% are currently running. Most of them are placed
outdoors and have around 5 sensors each.
The oldest station is from May 2014, while the latest station was registered a
couple of minutes ago.</p>
<p>Another feature of interest is the spatial distribution of the boxes: <code>plot()</code>
can help us out here. This function requires a bunch of optional dependencies though.</p>
<pre><code class="r">if (!require(&#39;maps&#39;)) install.packages(&#39;maps&#39;)
if (!require(&#39;maptools&#39;)) install.packages(&#39;maptools&#39;)
if (!require(&#39;rgeos&#39;)) install.packages(&#39;rgeos&#39;)
plot(all_sensors)
</code></pre>
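<p><em>[Figure unavailable: the embedded image data for the output of <code>plot(all_sensors)</code> is missing from this file.]</em></p>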
<p>It seems we have to reduce our area of interest to Germany.</p>
<p>But what do these sensor stations actually measure? Let&#39;s find out.
<code>osem_phenomena()</code> gives us a named list of the counts of each observed
phenomenon for the given set of sensor stations:</p>
<pre><code class="r">phenoms = osem_phenomena(all_sensors)
str(phenoms)
</code></pre>
<pre><code>## List of 432
## $ Temperatur : int 1607
## $ rel. Luftfeuchte : int 1421
## $ PM10 : int 1200
## $ PM2.5 : int 1198
## $ Luftdruck : int 824
## $ Beleuchtungsstärke : int 480
## $ UV-Intensität : int 471
## $ Luftfeuchtigkeit : int 84
## $ Temperature : int 49
## $ Humidity : int 42
## $ Helligkeit : int 25
## $ Lautstärke : int 21
## $ Schall : int 20
## $ UV : int 20
## $ Pressure : int 19
## $ Licht : int 18
## $ Luftfeuchte : int 14
## $ Umgebungslautstärke : int 14
## $ Lämpötila : int 13
## $ Ilmanpaine : int 12
## $ Signal : int 12
## $ Feinstaub PM10 : int 10
## $ Feinstaub PM2.5 : int 9
## $ Kosteus : int 8
## $ Valonmäärä : int 8
## $ temperature : int 8
## $ PM01 : int 7
## $ Temperatur DHT22 : int 7
## $ UV-säteily : int 7
## $ Niederschlag : int 6
## $ UV-Strahlung : int 6
## $ Wind speed : int 6
## $ Windgeschwindigkeit : int 6
## $ humidity : int 6
## $ Ilmankosteus : int 5
## $ Wassertemperatur : int 5
## $ Windrichtung : int 5
## $ rel. Luftfeuchtigkeit : int 5
## $ Druck : int 4
## $ Light : int 4
## $ Temperature 1 : int 4
## $ UV Index : int 4
## $ UV-Säteily : int 4
## $ lautstärke : int 4
## $ rel. Luftfeuchte 1 : int 4
## $ relative Luftfeuchtigkeit : int 4
## $ Air pressure : int 3
## $ Batterie : int 3
## $ Battery : int 3
## $ DS18B20_Probe01 : int 3
## $ DS18B20_Probe02 : int 3
## $ DS18B20_Probe03 : int 3
## $ DS18B20_Probe04 : int 3
## $ DS18B20_Probe05 : int 3
## $ Licht (digital) : int 3
## $ Luftdruck (BME280) : int 3
## $ PM 10 : int 3
## $ PM 2.5 : int 3
## $ Temp : int 3
## $ Temperatur (BME280) : int 3
## $ Temperatur HDC1008 : int 3
## $ Temperatura : int 3
## $ Temperature 2 : int 3
## $ UV-Index : int 3
## $ Valoisuus : int 3
## $ Wind Gust : int 3
## $ pressure : int 3
## $ rel. Luftfeuchte DHT22 : int 3
## $ 1 : int 2
## $ 10 : int 2
## $ 2 : int 2
## $ 3 : int 2
## $ 4 : int 2
## $ 5 : int 2
## $ 6 : int 2
## $ 7 : int 2
## $ 8 : int 2
## $ 9 : int 2
## $ Air Pressure : int 2
## $ Anderer : int 2
## $ Battery voltage : int 2
## $ CO2 : int 2
## $ Feuchte : int 2
## $ Illuminance : int 2
## $ Intensity : int 2
## $ Leitfähigkeit : int 2
## $ Lichtintensität : int 2
## $ Luftdruck BMP180 : int 2
## $ Luftfeuchte (BME280) : int 2
## $ Luftqualität : int 2
## $ Lufttemperatur : int 2
## $ PM25 : int 2
## $ Radioactivity : int 2
## $ Radioaktivität : int 2
## $ Regen : int 2
## $ Relative Humidity : int 2
## $ Sound : int 2
## $ Temperatur (DHT22) : int 2
## $ Temperatur BMP180 : int 2
## [list output truncated]
</code></pre>
<p>That&#39;s quite some noise there, with many phenomena being measured by a single
sensor only, or many duplicated phenomena due to slightly different spellings.
We should clean that up, but for now let&#39;s just filter out the noise and find
those phenomena with high sensor numbers:</p>
<pre><code class="r">phenoms[phenoms &gt; 20]
</code></pre>
<pre><code>## $Temperatur
## [1] 1607
##
## $`rel. Luftfeuchte`
## [1] 1421
##
## $PM10
## [1] 1200
##
## $PM2.5
## [1] 1198
##
## $Luftdruck
## [1] 824
##
## $Beleuchtungsstärke
## [1] 480
##
## $`UV-Intensität`
## [1] 471
##
## $Luftfeuchtigkeit
## [1] 84
##
## $Temperature
## [1] 49
##
## $Humidity
## [1] 42
##
## $Helligkeit
## [1] 25
##
## $Lautstärke
## [1] 21
</code></pre>
<p>Alright, temperature it is! Fine particulate matter (PM2.5) seems to be more
interesting to analyze though.
We should check how many sensor stations provide useful data: We want only those
boxes with a PM2.5 sensor, that are placed outdoors and are currently submitting
measurements:</p>
<pre><code class="r">pm25_sensors = osem_boxes(
exposure = &#39;outdoor&#39;,
date = Sys.time(), # ±4 hours
phenomenon = &#39;PM2.5&#39;
)
</code></pre>
<pre><code class="r">summary(pm25_sensors)
</code></pre>
<pre><code>## boxes total: 788
##
## boxes by exposure:
## outdoor
## 788
##
## boxes by model:
## custom homeEthernetFeinstaub homeWifi
## 28 37 6
## homeWifiFeinstaub luftdaten_pms1003_bme280 luftdaten_pms5003_bme280
## 57 1 2
## luftdaten_pms7003_bme280 luftdaten_sds011 luftdaten_sds011_bme280
## 2 33 135
## luftdaten_sds011_bmp180 luftdaten_sds011_dht11 luftdaten_sds011_dht22
## 14 31 442
##
## $last_measurement_within
## 1h 1d 30d 365d never
## 764 777 780 785 3
##
## oldest box: 2016-06-02 12:09:47 (BalkonBox Mindener Str.)
## newest box: 2018-05-24 20:29:50 (Stadthalle)
##
## sensors per box:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 4.000 4.000 4.615 5.000 12.000
</code></pre>
<pre><code class="r">plot(pm25_sensors)
</code></pre>
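<p><em>[Figure unavailable: the embedded image data for the output of <code>plot(pm25_sensors)</code> is missing from this file.]</em></p>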
<p>That&#39;s still more than 700 measuring stations, we can work with that.</p>
<h3>Analyzing sensor data</h3>
<p>Having analyzed the available data sources, let&#39;s finally get some measurements.
We could call <code>osem_measurements(pm25_sensors)</code> now, however we are focussing on
a restricted area of interest, the city of Berlin.
Luckily we can get the measurements filtered by a bounding box:</p>
<pre><code class="r">library(sf)
</code></pre>
<pre><code>## Linking to GEOS 3.6.1, GDAL 2.1.4, proj.4 4.9.3
</code></pre>
<pre><code class="r">library(units)
library(lubridate)
</code></pre>
<pre><code>##
## Attaching package: &#39;lubridate&#39;
</code></pre>
<pre><code>## The following object is masked from &#39;package:base&#39;:
##
## date
</code></pre>
<pre><code class="r">library(dplyr)
</code></pre>
<pre><code>##
## Attaching package: &#39;dplyr&#39;
</code></pre>
<pre><code>## The following objects are masked from &#39;package:lubridate&#39;:
##
## intersect, setdiff, union
</code></pre>
<pre><code>## The following objects are masked from &#39;package:rgeos&#39;:
##
## intersect, setdiff, union
</code></pre>
<pre><code>## The following objects are masked from &#39;package:stats&#39;:
##
## filter, lag
</code></pre>
<pre><code>## The following objects are masked from &#39;package:base&#39;:
##
## intersect, setdiff, setequal, union
</code></pre>
<pre><code class="r"># construct a bounding box: 12 kilometers around Berlin
berlin = st_point(c(13.4034, 52.5120)) %&gt;%
st_sfc(crs = 4326) %&gt;%
st_transform(3857) %&gt;% # allow setting a buffer in meters
st_buffer(set_units(12, km)) %&gt;%
st_transform(4326) %&gt;% # the opensensemap expects WGS 84
st_bbox()
</code></pre>
<pre><code class="r">pm25 = osem_measurements(
berlin,
phenomenon = &#39;PM2.5&#39;,
from = now() - days(20), # defaults to 2 days
to = now()
)
plot(pm25)
</code></pre>
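<p><em>[Figure unavailable: the embedded image data for the output of <code>plot(pm25)</code> is missing from this file.]</em></p>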
<p>Now we can get started with actual spatiotemporal data analysis.
First, let&#39;s mask the seemingly uncalibrated sensors:</p>
<pre><code class="r">outliers = filter(pm25, value &gt; 100)$sensorId
bad_sensors = outliers[, drop = T] %&gt;% levels()
pm25 = mutate(pm25, invalid = sensorId %in% bad_sensors)
</code></pre>
<p>Then plot the measuring locations, flagging the outliers:</p>
<pre><code class="r">st_as_sf(pm25) %&gt;% st_geometry() %&gt;% plot(col = factor(pm25$invalid), axes = T)
</code></pre>
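<p><em>[Figure unavailable: the embedded image data for the plot of the measuring locations is missing from this file.]</em></p>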
<p>Removing these sensors yields a nicer time series plot:</p>
<pre><code class="r">pm25 %&gt;% filter(invalid == FALSE) %&gt;% plot()
</code></pre>
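<p><em>[Figure unavailable: the embedded image data for the time series plot of the remaining sensors is missing from this file.]</em></p>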
<p>Further analysis: comparison with LANUV data <code>TODO</code></p>
</body>
</html>