
COVID-19 deaths animation map


[This article was first published on r.iresmi.net, and kindly contributed to R-bloggers.]

Coronavirus deaths in France
# Animated map of COVID-19 deaths in France
# with kernel smoothing

# sources -----------------------------------------------------------------
# https://www.data.gouv.fr/fr/datasets/donnees-hospitalieres-relatives-a-lepidemie-de-covid-19/
fichier_covid <- "donnees/covid.csv"
url_donnees_covid <- "https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7"

# https://www.insee.fr/fr/statistiques/2012713#tableau-TCRD_004_tab1_departements
fichier_pop <- "donnees/pop.xls"
url_donnees_pop <- "https://www.insee.fr/fr/statistiques/fichier/2012713/TCRD_004.xls"

# Adminexpress: to be downloaded manually
# https://geoservices.ign.fr/documentation/diffusion/telechargement-donnees-libres.html#admin-express
# aex <- "donnees/1_DONNEES_LIVRAISON_2019-03-14/"
aex <- path_expand("~/Downloads/ADMIN-EXPRESS_2-2__SHP__FRA_2020-02-24/ADMIN-EXPRESS/1_DONNEES_LIVRAISON_2020-02-24")

# config ------------------------------------------------------------------
library(tidyverse)
library(httr)
library(fs)
library(sf)
library(readxl)
library(janitor)
library(glue)
library(tmap)
library(grid)
library(classInt)
library(magick)
# + btb, raster, fasterize, plyr

rayon <- 100000          # smoothing bandwidth (m)
pixel <- 10000           # grid resolution (m)
force_download <- FALSE  # re-download even if the file exists and was downloaded today?

#' Kernel weighted smoothing with arbitrary bounding area
#'
#' @param df sf object (points)
#' @param field weight field in the df
#' @param bandwidth kernel bandwidth (map units)
#' @param resolution output grid resolution (map units)
#' @param zone sf study zone (polygon)
#' @param out_crs EPSG (should be an equal-area projection)
#'
#' @return a raster object
#' @import btb, raster, fasterize, dplyr, plyr, sf
lissage <- function(df, field, bandwidth, resolution, zone, out_crs = 3035) {
  if (st_crs(zone)$epsg != out_crs) {
    message("reprojecting study zone...")
    zone <- st_transform(zone, out_crs)
  }

  if (st_crs(df)$epsg != out_crs) {
    message("reprojecting data...")
    df <- st_transform(df, out_crs)
  }

  zone_bbox <- st_bbox(zone)

  # grid generation
  message("generating reference grid...")
  zone_xy <- zone %>%
    dplyr::select(geometry) %>%
    st_make_grid(
      cellsize = resolution,
      offset = c(
        plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
        plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor)
      ),
      what = "centers"
    ) %>%
    st_sf() %>%
    st_join(zone, join = st_intersects, left = FALSE) %>%
    st_coordinates() %>%
    as_tibble() %>%
    dplyr::select(x = X, y = Y)

  # kernel
  message("computing kernel...")
  kernel <- df %>%
    cbind(., st_coordinates(.)) %>%
    st_set_geometry(NULL) %>%
    dplyr::select(x = X, y = Y, field) %>%
    btb::kernelSmoothing(
      dfObservations = .,
      sEPSG = out_crs,
      iCellSize = resolution,
      iBandwidth = bandwidth,
      vQuantiles = NULL,
      dfCentroids = zone_xy
    )

  # rasterization
  message("\nrasterizing...")
  raster::raster(
    xmn = plyr::round_any(zone_bbox[1] - bandwidth, resolution, f = floor),
    ymn = plyr::round_any(zone_bbox[2] - bandwidth, resolution, f = floor),
    xmx = plyr::round_any(zone_bbox[3] + bandwidth, resolution, f = ceiling),
    ymx = plyr::round_any(zone_bbox[4] + bandwidth, resolution, f = ceiling),
    resolution = resolution
  ) %>%
    fasterize::fasterize(kernel, ., field = field)
}

# download ------------------------------------------------------------------
if (!dir_exists("donnees")) dir_create("donnees")
if (!dir_exists("resultats")) dir_create("resultats")
if (!dir_exists("resultats/animation")) dir_create("resultats/animation")

if (!file_exists(fichier_covid) |
    file_info(fichier_covid)$modification_time < Sys.Date() |
    force_download) {
  GET(url_donnees_covid,
      progress(),
      write_disk(fichier_covid, overwrite = TRUE))
}

if (!file_exists(fichier_pop)) {
  GET(url_donnees_pop,
      progress(),
      write_disk(fichier_pop))
}

# data ----------------------------------------------------------------------
covid <- read_csv2(fichier_covid)

# pre-downloaded Adminexpress
dep <- read_sf(path(aex, "ADE_2-2_SHP_LAMB93_FR/DEPARTEMENT.shp")) %>%
  clean_names() %>%
  st_set_crs(2154)

pop <- read_xls(fichier_pop, skip = 2) %>%
  clean_names()

# preprocessing ---------------------------------------------------------------
# outline of metropolitan France for the reference grid
fichier_fr <- "donnees/fr.rds"
if (!file_exists(fichier_fr)) {
  fr <- dep %>%
    st_union() %>%
    st_sf() %>%
    write_rds(fichier_fr)
} else {
  fr <- read_rds(fichier_fr)
}

# join the datasets
creer_df <- function(territoire, date = NULL) {
  territoire %>%
    left_join(pop, by = c("insee_dep" = "x1")) %>%
    left_join(
      covid %>%
        filter(jour == if_else(is.null(date), max(jour), date),
               sexe == 0) %>%
        group_by(dep) %>%
        summarise(deces = sum(dc, na.rm = TRUE),
                  reanim = sum(rea, na.rm = TRUE),
                  hospit = sum(hosp, na.rm = TRUE)),
      by = c("insee_dep" = "dep")) %>%
    st_point_on_surface()
}

covid_geo_pop <- creer_df(dep)

# smoothing -------------------------------------------------------------------
# generate the latest mortality grid
# and build the grids per 100,000 inhabitants

# deaths, metropolitan France
d <- covid_geo_pop %>%
  lissage("deces", rayon, pixel, fr)

# population, metropolitan France and overseas departments
p <- covid_geo_pop %>%
  lissage("x2020_p", rayon, pixel, fr)

# grid per 100,000 inhabitants
d100k <- d * 100000 / p

# classification reused for all the maps
set.seed(1234)
classes <- classIntervals(raster::values(d100k), n = 5, style = "kmeans", dataPrecision = 0)$brks

# animation ---------------------------------------------------------------
image_animation <- function(date) {
  m <- creer_df(dep, date) %>%
    lissage("deces", rayon, pixel, fr) %>%
    magrittr::divide_by(p) %>%
    magrittr::multiply_by(100000) %>%
    tm_shape() +
    tm_raster(title = glue("décès à l'hôpital
                            pour 100 000 hab."),
              style = "fixed",
              breaks = classes,
              palette = "viridis",
              legend.format = list(text.separator = "à moins de",
                                   digits = 0),
              legend.reverse = TRUE) +
    tm_shape(dep) +
    tm_borders() +
    tm_layout(title = glue("COVID-19 - France - cumul au {date}"),
              legend.position = c("left", "bottom"),
              frame = FALSE) +
    tm_credits(glue("http://r.iresmi.net/
                     lissage noyau bisquare {rayon / 1000} km sur grille {pixel / 1000} km
                     classif. kmeans
                     projections LAEA Europe
                     données départementales Santé publique France,
                     INSEE RP 2020, IGN Adminexpress 2020"),
               size = .5,
               position = c(.5, .025))

  tmap_save(m, glue("resultats/animation/covid_fr_{date}.png"),
            width = 800, height = 800, scale = .4)
}

unique(covid$jour) %>%
  walk(image_animation)

animation <- glue("resultats/deces_covid19_fr_{max(covid$jour)}.gif")

dir_ls("resultats/animation") %>%
  map(image_read) %>%
  image_join() %>%
  # image_scale("500x500") %>%
  image_morph(frames = 10) %>%
  image_animate(fps = 10, optimize = TRUE) %>%
  image_write(animation)

To leave a comment for the author, please follow the link and comment on their blog: r.iresmi.net.



poorman: First Release of a base R dplyr Clone


[This article was first published on Random R Ramblings, and kindly contributed to R-bloggers.]

Introduction

The first official release of poorman (v 0.1.9) is now on CRAN! You can now install poorman directly from CRAN with the following code:

install.packages("poorman")

In this blog post I want to address some common questions that I have received since I started writing the package.

What is poorman?

poorman is a package that unapologetically attempts to recreate the dplyr API in a dependency free way using only base R. poorman is still under development and doesn’t have all of dplyr’s functionality but what I would consider the “core” functionality is included. The idea behind poorman is that a user should be able to take their dplyr based script and run it using poorman without any hiccups.

So what does poorman include?

In this first official release, poorman includes copies of the key dplyr functions.

  • select(), rename(), pull(), relocate(), mutate(), transmute(), arrange()
  • filter(), slice()
  • summarise() / summarize()
  • group_by(), ungroup()

poorman also includes the join functionality.

  • inner_join(), left_join(), right_join(), full_join()
  • anti_join(), semi_join()

Finally poorman also includes its own version of the pipe so you do not need to load or install magrittr.

%>%

More functionality is being developed and will be added in time.
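As a quick illustration of what this looks like in practice, here is a minimal sketch using the built-in mtcars data (nothing beyond base R and poorman is assumed; exact printing may differ between versions):

library(poorman)

# A dplyr-style pipeline running entirely on a base R data.frame
mtcars %>%
  select(mpg, cyl, wt) %>%                 # keep three columns
  filter(mpg > 20) %>%                     # rows with mpg above 20
  group_by(cyl) %>%                        # group by number of cylinders
  summarise(mean_mpg = mean(mpg),          # one summary row per group
            mean_wt  = mean(wt)) %>%
  arrange(desc(mean_mpg))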

Why develop poorman?

This is probably the most common question: why bother developing poorman when dplyr already exists? Well, there are actually several reasons why I decided to develop it. The most important reason to me, though, is quite simply because I can. poorman started out as a personal challenge and a bit of fun. Also, as a freelance R developer, it is good to build up a portfolio of open source code that I can show to potential clients.

Another reason for developing poorman is that I wanted to challenge a common misconception that base R is not as powerful, or as good, or as useful as dplyr. Too often I see and hear comments belittling base R, and as a user of the language for over 10 years now – well before the inception of dplyr – I find this idea very worrying. poorman's package start-up message is quite poignant in this regard.

I’d seen my father. He was a poor man, and I watched him do astonishing things. – Sidney Poitier

Finally, I have a natural joy of teaching. Writing poorman gives me a platform to hopefully show useRs two key aspects of programming in base R: common data manipulation tasks and non-standard evaluation.

But why not just use dplyr?

Let’s be honest, the tidyverse is a fantastic set of packages which have transformed the face of data analysis in R, and dplyr is arguably one of the most important packages within the tidyverse. The API is in my opinion very easy to learn and use.

Being a part of the tidyverse, however, means that dplyr comes with a large number of dependencies that users must also install which is often seen as a disadvantage to using the tidyverse. Disadvantages of dependencies have been written about before and so I won’t go into detail here. However what I will say is that the user may not have a need for additional parts of the tidyverse and so may not wish to have to install multiple packages to use one or two functions.

Some of these dependencies are very useful of course, allowing expansion into other areas such as accessing Spark instances and databases using the same API the user already knows. This is great and if you are using these additional tools then I absolutely recommend that you choose dplyr over poorman. However if you don’t need the extra dependencies and functionality that comes with the wider tidyverse then maybe consider giving the lightweight poorman a go.

Finally, a point on installation times: poorman takes roughly six seconds to install. If you’ve ever had to install or upgrade dplyr or the tidyverse, you’ll recognise that this is very fast.
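If you want to check this claim on your own machine, a quick and admittedly unscientific sketch (timings will vary with your connection and CRAN mirror):

# time a fresh install from CRAN
system.time(install.packages("poorman"))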

Why the name poorman?

As I have already mentioned, I have seen comments in the past pertaining to R’s worthlessness without the tidyverse, and so the name poorman is a subtle play on the idea that you must be a “poor man” if you use base R. The irony, of course, is that I have managed to recreate – quite easily – the key parts of the dplyr API using only base R.

Why not use data.table for the backend?

Because I wanted to build something that was completely dependency free and adding data.table as an Import adds a dependency.

But doesn’t poorman have dependencies?

To answer this, we need to define what we mean by “dependency free”. poorman does have some dependencies, but they are for development purposes only and are therefore listed in the Suggests part of the DESCRIPTION file. Thus, when a user installs the package, these dependencies are only installed if they are explicitly requested. poorman doesn’t have any dependencies that users of the package need to install in order to use its functionality; I use these development packages only to make my own work easier. Therefore poorman isn’t truly “dependency free” like data.table is, but it is dependency free for its intended users.
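One way to verify this yourself is to inspect the installed package's DESCRIPTION; a small sketch using only base R tooling (the exact output depends on the poorman version you have installed):

# hard dependencies a user would need: Depends/Imports of the installed package
utils::packageDescription("poorman", fields = c("Depends", "Imports"))

# development-only packages live in Suggests and are not installed by default
utils::packageDescription("poorman", fields = "Suggests")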

Conclusion

So if you find yourself needing a dependency free data manipulation package that follows the dplyr API with short installation times then give poorman a try. Equally if you find any issues, please submit an issue to GitHub.


To leave a comment for the author, please follow the link and comment on their blog: Random R Ramblings.


Bootstrap resampling with #TidyTuesday beer production data


[This article was first published on Rstats on Julia Silge, and kindly contributed to R-bloggers.]

I’ve been publishing screencasts demonstrating how to use the tidymodels framework, from first steps in modeling to how to tune more complex models. Today, I’m using this week’s #TidyTuesday dataset on beer production to show how to use bootstrap resampling to estimate model parameters.

Here is the code I used in the video, for those who prefer reading instead of or in addition to video.

Explore the data

Our modeling goal here is to estimate how much sugar beer producers use relative to malt according to the #TidyTuesday dataset. We’ll use bootstrap resampling to do this! 🍻

First, let’s look at the data on brewing materials.

library(tidyverse)

brewing_materials_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-31/brewing_materials.csv")

brewing_materials_raw %>%
  count(type, wt = month_current, sort = TRUE)
## # A tibble: 12 x 2
##    type                                 n
##  1 Total Used                 53559516695
##  2 Total Grain products       44734903124
##  3 Malt and malt products     32697313882
##  4 Total Non-Grain products    8824613571
##  5 Sugar and syrups            6653104081
##  6 Rice and rice products      5685742541
##  7 Corn and corn products      5207759409
##  8 Hops (dry)                  1138840132
##  9 Other                        998968470
## 10 Barley and barley products   941444745
## 11 Wheat and wheat products     202642547
## 12 Hops (used as extracts)       33700888

How have some different brewing materials changed over time?

brewing_filtered <- brewing_materials_raw %>%
  filter(
    type %in% c(
      "Malt and malt products",
      "Sugar and syrups",
      "Hops (dry)"
    ),
    year < 2016,
    !(month == 12 & year %in% 2014:2015)
  ) %>%
  mutate(
    date = paste0(year, "-", month, "-01"),
    date = lubridate::ymd(date)
  )

brewing_filtered %>%
  ggplot(aes(date, month_current, color = type)) +
  geom_point()

There are strong annual patterns in these materials. We want to measure how much sugar beer producers use relative to malt.

brewing_materials <- brewing_filtered %>%
  select(date, type, month_current) %>%
  pivot_wider(
    names_from = type,
    values_from = month_current
  ) %>%
  janitor::clean_names()

brewing_materials
## # A tibble: 94 x 4
##    date       malt_and_malt_products sugar_and_syrups hops_dry
##  1 2008-01-01              374165152         78358212  4506546
##  2 2008-02-01              355687578         80188744  1815271
##  3 2008-03-01              399855819         78907213  6067167
##  4 2008-04-01              388639443         81199989  6864440
##  5 2008-05-01              411307544         89946309  7470130
##  6 2008-06-01              415161326         81012422  7361941
##  7 2008-07-01              405393784         76728131  1759452
##  8 2008-08-01              389391266         83928121  5992025
##  9 2008-09-01              362587470         71982604  3788942
## 10 2008-10-01              353803777         42828943  3788949
## # … with 84 more rows
brewing_materials %>%
  ggplot(aes(malt_and_malt_products, sugar_and_syrups)) +
  geom_smooth(method = "lm") +
  geom_point()

There is a lot of variation in this relationship, but beer producers use more sugar when they use more malt. What is the relationship?

library(tidymodels)

beer_fit <- lm(sugar_and_syrups ~ 0 + malt_and_malt_products,
               data = brewing_materials)

summary(beer_fit)
## 
## Call:
## lm(formula = sugar_and_syrups ~ 0 + malt_and_malt_products, data = brewing_materials)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -29985291  -6468052    174001   7364462  23462837 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## malt_and_malt_products 0.205804   0.003446   59.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11480000 on 93 degrees of freedom
## Multiple R-squared:  0.9746, Adjusted R-squared:  0.9743 
## F-statistic:  3567 on 1 and 93 DF,  p-value: < 2.2e-16
tidy(beer_fit)
## # A tibble: 1 x 5
##   term                   estimate std.error statistic  p.value
## 1 malt_and_malt_products    0.206   0.00345      59.7 5.72e-76

Here I am choosing to set the intercept to zero to take a simplified view of the malt-sugar relationship (i.e., beer producers don’t use any sugar if they aren’t starting with malt). We could leave that off and estimate both an intercept (baseline use of sugar all the time) and slope (increase in use of sugar per barrel of malt).
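For comparison, the model with an intercept would simply drop the 0 + from the formula; this sketch is not part of the original analysis, just the alternative specification described above:

# estimate both an intercept (baseline sugar use) and a slope
beer_fit_intercept <- lm(sugar_and_syrups ~ malt_and_malt_products,
                         data = brewing_materials)
tidy(beer_fit_intercept)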

This model and the visualization above are based on model assumptions that may not hold with our real-world beer production data. Bootstrap resampling provides predictions and confidence intervals that are more robust.

Bootstrap resampling

First, let’s create a set of bootstrap resamples.

set.seed(123)
beer_boot <- bootstraps(brewing_materials, times = 1e3, apparent = TRUE)

beer_boot
## # Bootstrap sampling with apparent sample 
## # A tibble: 1,001 x 2
##    splits          id           
##  1  Bootstrap0001
##  2  Bootstrap0002
##  3  Bootstrap0003
##  4  Bootstrap0004
##  5  Bootstrap0005
##  6  Bootstrap0006
##  7  Bootstrap0007
##  8  Bootstrap0008
##  9  Bootstrap0009
## 10  Bootstrap0010
## # … with 991 more rows

Next, let’s train a model to each of these bootstrap resamples. We can use tidy() with map() to create a dataframe of model results.

beer_models <- beer_boot %>%
  mutate(
    model = map(splits, ~ lm(sugar_and_syrups ~ 0 + malt_and_malt_products, data = .)),
    coef_info = map(model, tidy)
  )

beer_coefs <- beer_models %>%
  unnest(coef_info)

beer_coefs
## # A tibble: 1,001 x 8
##    splits     id        model term         estimate std.error statistic  p.value
##  1   malt_and_ma…    0.203   0.00326      62.3 1.31e-77
##  2   malt_and_ma…    0.208   0.00338      61.7 3.17e-77
##  3   malt_and_ma…    0.205   0.00336      61.1 7.30e-77
##  4   malt_and_ma…    0.206   0.00361      57.1 3.26e-74
##  5   malt_and_ma…    0.203   0.00349      58.3 4.77e-75
##  6   malt_and_ma…    0.209   0.00335      62.2 1.33e-77
##  7   malt_and_ma…    0.210   0.00330      63.7 1.73e-78
##  8   malt_and_ma…    0.209   0.00359      58.2 5.52e-75
##  9   malt_and_ma…    0.207   0.00342      60.5 1.74e-76
## 10   malt_and_ma…    0.207   0.00378      54.9 1.14e-72
## # … with 991 more rows

Evaluate results

What is the distribution of the relationship between sugar and malt?

beer_coefs %>%
  ggplot(aes(estimate)) +
  geom_histogram(alpha = 0.7, fill = "cyan3")

We can see where this distribution is centered and how broad it is from this visualization, and we can estimate these quantities using int_pctl() from the rsample package.

int_pctl(beer_models, coef_info)
## # A tibble: 1 x 6
##   term                   .lower .estimate .upper .alpha .method   
## 1 malt_and_malt_products  0.199     0.206  0.212   0.05 percentile

We can also visualize some of these fits to the bootstrap resamples. First, let’s use augment() to get the fitted values for each resampled data point.

beer_aug <- beer_models %>%
  sample_n(200) %>%
  mutate(augmented = map(model, augment)) %>%
  unnest(augmented)

beer_aug
## # A tibble: 18,800 x 13
##    splits id    model coef_info sugar_and_syrups malt_and_malt_p… .fitted
## # … with more rows, and further variables: .resid, .hat, .sigma, .cooksd, .std.resid

Then, let’s create a visualization.

ggplot(beer_aug, aes(malt_and_malt_products, sugar_and_syrups)) +
  geom_line(aes(y = .fitted, group = id), alpha = .2, col = "cyan3") +
  geom_point()


To leave a comment for the author, please follow the link and comment on their blog: Rstats on Julia Silge.


B is for bind_rows


[This article was first published on Deeply Trivial, and kindly contributed to R-bloggers.]

Moving on to the letter B, today we’ll talk about merging datasets that contain the same variables but add new cases. This is easily done with bind_rows. Let’s say I realized I forgot to log some of the books I read last year, and I wanted to merge those in to my existing dataset. I selected a handful of books from my to-read list, generated some read time and rating data, and saved the results in a csv file (which you can find here). Now I want to load my existing dataset and the new one:

library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 -- 
##  ggplot2 3.2.1      purrr   0.3.3 
##  tibble  2.1.3      dplyr   0.8.3 
##  tidyr   1.0.0      stringr 1.4.0 
##  readr   1.3.1      forcats 0.4.0 
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
reads2019 <- read_csv("~/Downloads/Blogging A to Z/SarasReads2019.csv",
                      col_names = TRUE)
## Parsed with column specification:
## cols(
##   Title = col_character(),
##   Pages = col_double(),
##   date_started = col_character(),
##   date_read = col_character(),
##   Book.ID = col_double(),
##   Author = col_character(),
##   AdditionalAuthors = col_character(),
##   AverageRating = col_double(),
##   OriginalPublicationYear = col_double(),
##   read_time = col_double(),
##   MyRating = col_double(),
##   Gender = col_double(),
##   Fiction = col_double(),
##   Childrens = col_double(),
##   Fantasy = col_double(),
##   SciFi = col_double(),
##   Mystery = col_double(),
##   SelfHelp = col_double()
## )
addreads <- read_csv("~/Downloads/Blogging A to Z/SarasAdds.csv")
## Parsed with column specification:
## cols(
##   Title = col_character(),
##   Pages = col_double(),
##   date_started = col_character(),
##   date_read = col_character(),
##   Book.ID = col_double(),
##   Author = col_character(),
##   AdditionalAuthors = col_character(),
##   AverageRating = col_double(),
##   OriginalPublicationYear = col_double(),
##   read_time = col_double(),
##   MyRating = col_double(),
##   Gender = col_double(),
##   Fiction = col_double(),
##   Childrens = col_double(),
##   Fantasy = col_double(),
##   SciFi = col_double(),
##   Mystery = col_double(),
##   SelfHelp = col_double()
## )

Now we just bind the two datasets together:

reads2019 <- reads2019 %>%
  bind_rows(addreads)

Did these additions change the ordering by page length?

reads2019 <- reads2019 %>%
  arrange(desc(Pages), Author)
head(reads2019)
## # A tibble: 6 x 18
##   Title Pages date_started date_read Book.ID Author AdditionalAutho…
## 1 The …  1216 6/12/2019    6/18/2019  3.30e1 Tolki…
## 2 The …  1181 6/12/2019    6/17/2019  1.86e7 Atwoo…
## 3 It     1156 8/14/2019    8/21/2019  2.79e7 King,…
## 4 1Q84    925 9/3/2019     9/10/2019  1.04e7 Murak… Jay Rubin, Phil…
## 5 Inso…   890 8/10/2019    8/13/2019  1.06e4 King,… Bettina Blanch …
## 6 The …   592 8/18/2019    8/23/2019  1.16e4 King,…
## # … with 11 more variables: AverageRating, OriginalPublicationYear,
## #   read_time, MyRating, Gender, Fiction,
## #   Childrens, Fantasy, SciFi, Mystery, SelfHelp

It did! The longest book is now The Lord of the Rings, at 1216 pages, and number two is The MaddAddam Trilogy, 1181 pages.

This is a pretty easy trick. Later on in this series, we’ll talk about combining datasets that share cases but add new variables – joins – which is one of the times the tidy data mindset becomes very important.
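As a tiny preview of that difference (the genres lookup table below is made up purely for illustration, so the titles may not match your data exactly):

# bind_rows() stacks new cases; a join adds new variables by matching on a key
genres <- tibble(
  Title = c("The Lord of the Rings", "It"),
  MainGenre = c("Fantasy", "Horror")
)

reads2019 %>%
  left_join(genres, by = "Title") %>%
  select(Title, Pages, MainGenre) %>%
  head()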


To leave a comment for the author, please follow the link and comment on their blog: Deeply Trivial.


RStudio Connect 1.8.2


[This article was first published on RStudio Blog, and kindly contributed to R-bloggers.]

A big update for our Python community

One of the biggest frustrations for a data scientist, whether your primary language is R or Python, is to have your hard work go underutilized. A stream of disposable reports, emails, and presentations that get viewed once and cast aside is not the ideal recipe for making an impact. To combat this, we have seen data scientists create more interactive content (such as applications, APIs, and dashboards) to engage the divided attention of stakeholders. Unfortunately, delivering interactivity often comes at the cost of learning far more about IT and infrastructure than perhaps you had planned.

At RStudio, we believe data scientists shouldn’t have to become experts in DevOps just to share their work with the rest of their organization. RStudio Connect was created to handle the burden of deployment and provide a single platform for all the content your team produces in R and Python. Today we are excited to announce RStudio Connect 1.8.2, with new options for data scientists who use Python to share and communicate; including support for Python APIs (with Flask) and beta support for interactive Python applications with Dash.

Schedule a demo of RStudio Connect

Flask API Deployment

RStudio Connect 1.8.2 introduces support for Python API deployment, including applications built with Flask and other WSGI-compatible frameworks. This functionality lets data science teams make models developed in Python available as REST APIs. Once deployed, an RStudio Connect publisher can give other teams or services access to the API, securely delivering data science insights across their organization.

Python API Example on RStudio Connect 1.8.2

RStudio Connect automatically integrates with several Flask extension packages like Flask-RESTX, Flask-API, and Flasgger to provide web-accessible documentation or an API console interface. Examples for each of these extensions can be found in the User Guide.

Publishing a Python API to RStudio Connect requires the rsconnect-python package. This package is available to install with pip from PyPI and provides a command-line interface that can be used to publish from any Python IDE, including PyCharm, VS Code, JupyterLab, Spyder, and others.

Developers with RStudio Connect publisher accounts can follow along with the new Python API Jump Start Example to learn the basic deployment workflow:

Flask Example in the Jump Start on RStudio Connect 1.8.2

Additional getting started information and examples can be found in the User Guide.

Download RStudio Connect 1.8.2

Beta Support for Dash Applications

Dash applications provide an easy way for Python users to create interactive applications and dashboards that help decision makers engage with their work.

Python users can develop Dash applications in the IDE of their choosing. Publishing an application to RStudio Connect is supported using the rsconnect-python package. Refer to the User Guide for details.

Once deployed to RStudio Connect, publishers can control access to their application, add viewers or collaborators, and adjust runtime settings to maximize performance or scale to meet audience demand.

Example Dash Application on RStudio Connect 1.8.2

This is a Dash application hosted on RStudio Connect that shows availability predictions for Washington DC’s docked bike-share stations. To see more examples like this, visit our Solutions Engineering website.

What does “Beta” Mean?
Dash support is a beta feature which is still undergoing final testing before its official release. Should you encounter any bugs, glitches, lack of functionality or other problems, please let us know so we can improve before public release.

Learn how data science teams use RStudio products
Visit R & Python – A Love Story

New & Notable

  • For Publishers, 1.8.2 makes it easy to share filtered content lists:

Share filtered content links in RStudio Connect 1.8.2

Easily share links to custom views of the content dashboard page, such as specific tags or search results.

  • For Administrators and Publishers, this release includes new default runtime settings that allow APIs and applications to scale more efficiently.

  • The User and Admin Guide documentation sites have been updated.

Security, Deprecations & Breaking Changes

  • Security: Enforce locked user restrictions for active browser sessions.

  • Breaking Change: The Postgres.URL database connection URL no longer supports the {$} password placeholder. The Postgres.URL automatically uses the Postgres.Password value without a placeholder.

  • Breaking Change: The Postgres.InstrumentationURL database connection URL no longer supports the {$} password placeholder. The Postgres.InstrumentationURL automatically uses the Postgres.InstrumentationPassword value without a placeholder.

  • Breaking Change: Due to breaking changes to the virtualenv package, Python installations must have a version of virtualenv below 20, e.g.: virtualenv<20. The version of setuptools must be 40.8 or higher. Incompatible versions will result in an error at startup.

  • Deprecation: The settings SAML.IdPSigningCertificate and SAML.SPEncryptionCertificate would previously accept the contents of a certificate as a long Base64 inline value. This option is no longer supported and a warning will be issued during startup if used. Now, these settings only support a path to a PEM certificate file.

  • Deprecation: The setting SAML.IdPMetaData has been deprecated. If this setting is currently used, a configuration migration will take place to transfer its value to either SAML.IdPMetaDataURL or SAML.IdPMetaDataPath. The deprecated metadata setting will be removed in a future release. The configuration migration for SAML settings is an automatic process that does not require immediate intervention but that will need a single manual step to be completed.

Please review the full version of the release notes available here.

Upgrade Planning

Due to breaking changes to the virtualenv package, Python installations must have a version of virtualenv below 20, e.g.: virtualenv<20. The version of setuptools must be 40.8 or higher. Incompatible versions will result in an error at startup. Review the documentation for more details. If you are upgrading from an earlier version, be sure to consult the release notes for the intermediate releases, as well.

Click through to learn more about RStudio Connect


To leave a comment for the author, please follow the link and comment on their blog: RStudio Blog.


RQuantLib 0.4.12: Small QuantLib 1.18 update


[This article was first published on Thinking inside the box, and kindly contributed to R-bloggers.]

A new release 0.4.12 of RQuantLib arrived on CRAN today, and was uploaded to Debian as well.

QuantLib is a very comprehensive free/open-source library for quantitative finance; RQuantLib connects it to the R environment and language.
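For readers who have not used the package before, here is a minimal sketch of the kind of thing RQuantLib exposes (the numbers are illustrative only, not taken from this release):

library(RQuantLib)

# price a plain-vanilla European call option via QuantLib's analytic engine
EuropeanOption(type = "call", underlying = 100, strike = 100,
               dividendYield = 0, riskFreeRate = 0.03,
               maturity = 0.5, volatility = 0.4)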

This version does relatively little. When QuantLib 1.18 came out, I immediately did my usual bit of packaging it for Debian, as well as creating binaries via my Ubuntu PPA, so that I could test the package against it. A few calls from RQuantLib were now hitting interface functions marked as ‘deprecated’, leading to compiler nags, so I fixed that in PR #146. And today CRAN sent me an email asking to please fix this in the released version—so I rolled this up as 0.4.12. No other changes.

Changes in RQuantLib version 0.4.12 (2020-04-01)

  • Changes in RQuantLib code:

    • Calls deprecated-in-QuantLib 1.18 were updated (Dirk in #146).

Courtesy of CRANberries, there is also a diffstat report for this release. As always, more detailed information is on the RQuantLib page. Questions, comments etc should go to the new rquantlib-devel mailing list. Issue tickets can be filed at the GitHub repo.

If you like this or other open-source work I do, you can now sponsor me at GitHub. For the first year, GitHub will match your contributions.

This post by Dirk Eddelbuettel originated on his Thinking inside the box blog. Please report excessive re-aggregation in third-party for-profit settings.


To leave a comment for the author, please follow the link and comment on their blog: Thinking inside the box .


Custom Power BI visual for Line chart with two Y-Axis


[This article was first published on R – TomazTsql, and kindly contributed to R-bloggers.]

Power BI supports certain types of visuals that are available in the document by default. These are absolutely great and work perfectly fine, with plenty of properties and settings to adjust.

But every so often over the past year, I have found myself wanting to visualize a typical line chart with two lines, each with its own Y-axis.

Let me give you a quick R example. First, a very simple dataset with two values, each on a slightly different scale: quantity, ranging from 499 to 760, and temperature, ranging from 15 to 24.

[Screenshot: the sample dataset in RStudio]
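The exact data behind the screenshot is not included in the post, so here is a small made-up stand-in with the same structure (column 2 = quantity, column 3 = temperature, matching the ranges above) that makes the plotting code below runnable:

set.seed(42)
dataset <- data.frame(
  date        = seq(as.Date("2020-01-01"), by = "day", length.out = 30),
  quantity    = round(runif(30, 499, 760)),    # column 2
  temperature = round(runif(30, 15, 24), 1)    # column 3
)
head(dataset)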

So what would normally happen when we put these two ranges on the same Y-axis:

# Both on the same y-axis
plot(dataset[, 3], type = "l", col = "red", lty = 1, ylim = c(0, 1000),
     ylab = "Qty and Temp")

# Add second line
lines(dataset[, 2], col = "brown", lty = 1)
legend("topleft", c("Temperature", "Quantity"),
       col = c("red", "brown"), lty = c(1, 1))

With the following results:

[Plot: quantity and temperature on a single y-axis]

We see that the temperature has almost lost all of its visible variance, whereas quantity still shows some.

To deal with this, we need to shift one of the lines to a second Y-axis on the right. This takes a little additional code:

par(mar = c(5, 5, 3, 5))
plot(dataset[, 2], type = "l", ylab = "Quantity",
     main = "Quantity and temperature", xlab = "Date", col = "brown")

par(new = TRUE)
plot(dataset[, 3], type = "l", xaxt = "n", yaxt = "n",
     ylab = "", xlab = "", col = "red", lty = 1)
axis(side = 4)
mtext("temperature", side = 4, line = 3)
legend("topleft", c("Quantity", "Temperature"), col = c("brown", "red"),
       lty = c(1, 1))

And the result is much more obvious:

[Plot: quantity (left y-axis) and temperature (right y-axis)]

And now we can see how the temperature fluctuates (for more than 10°C).

So now that we have a solution, let’s go back to Power BI.

1. Default Visual

With the same dataset, this is the “out-of-the-box” visual available to you:

[Screenshot: default combo visual (bar + line) in Power BI Desktop]

A combination of a bar chart and a line chart, each with its own y-axis. Great. But if I want two lines, this is not possible with the out-of-the-box visuals.

2. Using R or Python

Copying the R code from the introduction example above into Power BI gives you literally the same chart in Power BI. The same logic applies to Python.

[Screenshot: R visual with two y-axes in Power BI Desktop]

It does the job beautifully. So where is the catch with R or Python visuals? I have a couple of concerns:

  • too many times I have seen that, for data engineers who are just starting to use Power BI, R or Python is simply overkill to adopt quickly,
  • it takes coding to plot a graph, and not everyone has a good idea of how to tackle this, and
  • publishing and deploying Power BI reports with R or Python visuals to an on-premises Power BI Report Server will not work with either kind of visual.

This said, there is a lot of room for improvement.

3. Downloading custom visuals

Yes, you can download a custom visual. Go ahead 🙂

4. Building your own custom visual

This area is still under-explored among data engineers and business intelligence people. The purpose of this post is not only to encourage them to start exploring it on their own, but also to show them that tinkering with a custom visual is not a big deal.

I have built mine on this website: Charts PowerBI.

So, let’s walk through how to do it.

1. Get a sample dataset. You can download (data_twoCharts.csv) mine as well from Github.

2. Go to https://charts.powerbi.tips/ and select New.

3. Drag and drop the csv file into desired Data field, and you should get the preview of data:

[Screenshot: charts.powerbi.tips data preview]

4. Click Done.

5. The editor consists of 1) a blank canvas pane on the right-hand side, 2) a Glyph pane, 3) a Data pane and 4) a Layers pane.

[Screenshot: charts.powerbi.tips editor panes]

The Canvas pane can hold multiple plot segments, which we will use to generate two plots, one on top of the other.

6. Drag and drop the individual data columns (from the Data pane) onto the canvas pane. I dragged the date column first, dropping it on the X-axis (it snaps onto it automatically), and Quantity onto the Y-axis.

[Screenshot: Date on the X-axis and Quantity on the Y-axis]

7. Drag a Symbol into the Glyph pane, then click Links and Create Links. This will automatically connect all the dots.

[Screenshots: adding a Symbol glyph and creating links]

8. You should get a graph like this.

[Screenshot: the first line chart]

9. Great, halfway done. Now resize the plot segment to roughly 50% of the canvas. Click on the blank canvas and add another glyph, which will be associated with the new plot segment.

[Screenshot: resized plot segment with a new glyph]

10. Add another plot segment (remember, we are building two plots, one on top of the other).

[Screenshots: adding a second plot segment]

11. With the new plot segment, repeat steps 6, 7 and 8: drag Date (to the X-axis) and Temperature (not Quantity) (to the Y-axis) onto the canvas pane, drag a Symbol into the Glyph pane and click Links to Create Links. You should end up with:

[Screenshot: the second line chart with Date and Temperature]

12. On the new plot (on the right-hand side), we want to switch the Y-axis to the right side by clicking in the Layers pane for the selected plot segment.

[Screenshot: moving the Y-axis to the right side]

13. The last part is a bit finicky. Hover with your mouse over the corner of the left plot (and repeat with the right plot), grab the green point and drag it over the other plot. A yellow dotted line will appear to denote that you are expanding the graph.

[Screenshot: dragging one plot segment over the other]

14. Once you have done this for both, there will be only one X-axis (one plot on top of the other), and both plot segments will be represented as layers, one on top of the other.

[Screenshot: both plot segments layered on a shared x-axis]

15. Optionally, some colour coding and graph design is highly recommended; this can be changed in the Layers pane. Once you are happy and satisfied with your visual, export it as a Power BI custom visual:

[Screenshot: exporting the visual as a Power BI custom visual]

Give it a proper name and labels for the X and Y axes; these names will be visible in the Power BI document. And also give the custom visual itself a proper name 🙂

[Screenshot: naming the custom visual and its axis labels]

 

Once you have done this, open Power BI and add it the same way as any other additional/custom visual:

[Screenshot: importing the custom visual in Power BI]

From here on, you can use this visual with any dataset you want (it is not limited to the sample dataset used to create the custom visual), and it is also ready to be published/deployed to an on-premises Power BI Report Server.

 

All code, data sample and Power BI document with custom visual are available on GitHub.

Happy PowerBI-ing. 🙂

 

 

 

 


To leave a comment for the author, please follow the link and comment on their blog: R – TomazTsql.


RcppSimdJson 0.0.4: Even Faster Upstream!


[This article was first published on Thinking inside the box, and kindly contributed to R-bloggers.]

A new (upstream) simdjson release was announced by Daniel Lemire earlier this week, and my Twitter mentions have been running red-hot ever since, as he was kind enough to tag me. Do look at that blog post; there is some impressive work in there. We wrapped the (still very simple) RcppSimdJson package around it last night and shipped it this morning.

RcppSimdJson wraps the fantastic and genuinely impressive simdjson library by Daniel Lemire. Via some very clever algorithmic engineering to obtain largely branch-free code, coupled with modern C++ and newer compiler instructions, it parses gigabytes of JSON per second, which is quite mind-boggling. For illustration, I highly recommend the video of the recent talk by Daniel Lemire at QCon (which was also voted best talk). The best-case performance is ‘faster than CPU speed’ as the use of parallel SIMD instructions and careful branch avoidance can lead to less than one CPU cycle per byte parsed.
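The R-facing surface of the package is still intentionally small; the shipped example is JSON validation. A minimal sketch of using it (the temporary file and its content are made up for illustration):

library(RcppSimdJson)

# write a tiny JSON document and validate it with the simdjson-backed parser
tf <- tempfile(fileext = ".json")
writeLines('{"pkg": "RcppSimdJson", "fast": true}', tf)
validateJSON(tf)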

This release brings upstream 0.3 (and 0.3.1) plus a minor tweak (also shipped back upstream). Our full NEWS entry follows.

Changes in version 0.0.4 (2020-04-03)

  • Upgraded to new upstream releases 0.3 and 0.3.1 (Dirk in #9 closing #8)

  • Updated example validateJSON to API changes.

But because Daniel is such a fantastic upstream developer to collaborate with, he even filed a full feature request – ‘maybe you can consider upgrading’ – as issue #8 at our repo, containing the fully detailed list of changes. As it is so impressive I will simply quote the upper half of just the major changes:

Highlights

  • Multi-Document Parsing: Read a bundle of JSON documents (ndjson) 2-4x faster than doing it individually. API docs / Design Details
  • Simplified API: The API has been completely revamped for ease of use, including a new JSON navigation API and fluent support for error code and exception styles of error handling with a single API. Docs
  • Exact Float Parsing: Now simdjson parses floats flawlessly without any performance loss (https://github.com/simdjson/simdjson/pull/558). Blog Post
  • Even Faster: The fastest parser got faster! With a shiny new UTF-8 validator and meticulously refactored SIMD core, simdjson 0.3 is 15% faster than before, running at 2.5 GB/s (where 0.2 ran at 2.2 GB/s).

For questions, suggestions, or issues please use the issue tracker at the GitHub repo.

Courtesy of CRANberries, there is also a diffstat report for this release.

If you like this or other open-source work I do, you can now sponsor me at GitHub. For the first year, GitHub will match your contributions.

This post by Dirk Eddelbuettel originated on his Thinking inside the box blog. Please report excessive re-aggregation in third-party for-profit settings.


To leave a comment for the author, please follow the link and comment on their blog: Thinking inside the box .



Exploring and Benchmarking Oxford Government Response Data


[This article was first published on An Accounting and Data Science Nerd's Corner, and kindly contributed to R-bloggers.]


Assessing the impact of Non-Pharmaceutical Interventions on the spread of Covid-19 requires data on Governmental measures. Luckily, the Assessment Capacities Project (ACAPS) and the Oxford Covid-19 Government Response Tracker both provide such data. In this blog post, I explore the new data provided by the Oxford initiative and compare it against the data provided by ACAPS that is already included in my {tidycovid19} package that offers download handles and some visualization tools for Covid-19 related data.

The publication of the Government Response Tracker data has spurred some interest in the media and the academic community, and there are already studies using it. Its methodology is presented in Hale, Thomas, Anna Petherick, Toby Phillips, Samuel Webster. “Variation in Government Responses to COVID-19”. Version 3.0. Blavatnik School of Government Working Paper. March 31, 2020.

Downloading and data cleaning

Downloading the data from the Oxford homepage is straightforward. Automatic column detection by read_xlsx() fails, so I provide the column types manually.

suppressPackageStartupMessages({
  library(kableExtra)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(tidycovid19)
  library(ggplot2)
  library(stringr)
  library(readxl)
  library(gghighlight)
  library(RCurl)
})

dta_url <- "https://www.bsg.ox.ac.uk/sites/default/files/OxCGRT_Download_latest_data.xlsx"
tmp_file <- tempfile(".xlsx")
utils::download.file(dta_url, tmp_file, mode = "wb")

raw_data <- read_xlsx(
  tmp_file,
  col_types = c("text", "text", "numeric",
                rep(c("numeric", "numeric", "text"), 6),
                rep(c("numeric", "text"), 5), rep("numeric", 3), "skip")
)

The file is organized by country-date and sorted by date. As interventions data is, in essence, event-driven for each country (meaning that interventions happen infrequently at certain dates), I sort the data by country-date to get a better view of its structure. Also, I adjust some names and concentrate on the policy measures first, discarding the other data for the time being.

raw_data <- raw_data %>%
  dplyr::rename(
    country = CountryName,
    iso3c = CountryCode,
    date = Date
  ) %>%
  dplyr::mutate(date = lubridate::ymd(date)) %>%
  arrange(iso3c, date)

df <- raw_data %>%
  select(-country, -ConfirmedCases, -ConfirmedDeaths, -ends_with("_Notes"),
         -ends_with("_IsGeneral"), -StringencyIndex,
         -starts_with(paste0("S", 8:11)))

kable(df %>% head(20)) %>% kable_styling()
| iso3c | date | S1_School closing | S2_Workplace closing | S3_Cancel public events | S4_Close public transport | S5_Public information campaigns | S6_Restrictions on internal movement | S7_International travel controls |
| ABW | 2020-03-13 | NA | NA | NA | NA | NA | NA | NA |
| ABW | 2020-03-15 | NA | NA | NA | NA | NA | NA | 3 |
| ABW | 2020-03-16 | 2 | NA | 2 | NA | NA | NA | 3 |
| ABW | 2020-03-17 | 2 | NA | 2 | NA | NA | NA | 3 |
| ABW | 2020-03-18 | 2 | NA | 2 | NA | NA | NA | 3 |
| ABW | 2020-03-19 | 2 | NA | 2 | NA | NA | NA | 3 |
| ABW | 2020-03-20 | 2 | NA | 2 | NA | NA | NA | 3 |
| ABW | 2020-03-21 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-22 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-23 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-24 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-25 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-26 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-27 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-28 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-29 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-30 | 2 | NA | 2 | NA | NA | 2 | 3 |
| ABW | 2020-03-31 | 2 | NA | 2 | NA | NA | 2 | 3 |
| AFG | 2020-01-01 | 0 | NA | 0 | NA | 0 | 0 | 0 |
| AFG | 2020-01-02 | 0 | NA | 0 | NA | 0 | 0 | 0 |

You can see that at some point in time measures are introduced and then they are maintained. To make it more transparent which events are actually driving the values, I reorganize the data into a country-date-npi_type structure. This requires some shuffling and tidying, as each NPI type has three variables and the actual type is captured in a variable name.

df <- raw_data

# Fix column names for pivot_longer()
names(df)[seq(from = 4, by = 3, length.out = 7)] <- paste0("S", 1:7, "_measure")

df <- df %>% select(1:23) %>%
  # S7 has no "IsGeneral" value. I attach an NA var for consistency
  mutate(S7_IsGeneral = NA) %>%
  pivot_longer(4:24, names_pattern = "(.*)_(.*)", names_to = c("type", ".value")) %>%
  rename(npi_measure = measure, npi_is_general = IsGeneral, npi_notes = Notes)

# Fix NPI type categories
lup <- tibble(
  type = paste(paste0("S", 1:7)),
  npi_type = sub("S\\d*_", "", names(raw_data)[seq(from = 4, by = 3, length.out = 7)])
)

oxford_pm <- df %>%
  left_join(lup, by = "type") %>%
  select(iso3c, country, date, npi_type, npi_measure, npi_is_general, npi_notes) %>%
  arrange(iso3c, npi_type, date)

# Let's display an example
oxford_pm %>%
  filter(iso3c == "ABW" & npi_type == "Restrictions on internal movement") %>%
  kable() %>% kable_styling()
| iso3c | country | date | npi_type | npi_measure | npi_is_general | npi_notes |
| ABW | Aruba | 2020-03-13 | Restrictions on internal movement | NA | NA | NA |
| ABW | Aruba | 2020-03-15 | Restrictions on internal movement | NA | NA | NA |
| ABW | Aruba | 2020-03-16 | Restrictions on internal movement | NA | NA | NA |
| ABW | Aruba | 2020-03-17 | Restrictions on internal movement | NA | NA | NA |
| ABW | Aruba | 2020-03-18 | Restrictions on internal movement | NA | NA | NA |
| ABW | Aruba | 2020-03-19 | Restrictions on internal movement | NA | NA | NA |
| ABW | Aruba | 2020-03-20 | Restrictions on internal movement | NA | NA | NA |
| ABW | Aruba | 2020-03-21 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-22 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-23 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-24 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-25 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-26 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-27 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-28 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-29 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-30 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |
| ABW | Aruba | 2020-03-31 | Restrictions on internal movement | 2 | NA | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html |

In this snippet of the data everything is sticky, even the notes. To remove these stale data from the sample, I next limit the sample to observations that differ from those of the previous country-day. First rows are kept only if they contain non-missing data. Note that this does not discard information; it just helps make the data more parsimonious. Just compare the information on Aruba after the cleaning with the one above.

oxford_pm_events <- oxford_pm %>%
  group_by(iso3c, npi_type) %>%
  filter(
    (row_number() == 1 &
       (!is.na(npi_is_general) | !is.na(npi_measure) | !is.na(npi_notes))) |
      (is.na(lag(npi_is_general)) & !is.na(npi_is_general)) |
      (is.na(lag(npi_measure)) & !is.na(npi_measure)) |
      (is.na(lag(npi_notes)) & !is.na(npi_notes)) |
      (!is.na(lag(npi_is_general)) & is.na(npi_is_general)) |
      (!is.na(lag(npi_measure)) & is.na(npi_measure)) |
      (!is.na(lag(npi_notes)) & is.na(npi_notes)) |
      (lag(npi_is_general) != npi_is_general) |
      (lag(npi_measure) != npi_measure) |
      (lag(npi_notes) != npi_notes)
  ) %>%
  ungroup()

oxford_pm_events %>%
  filter(iso3c == "ABW" & npi_type == "Restrictions on internal movement") %>%
  kable() %>% kable_styling()
iso3c | country | date       | npi_type                          | npi_measure | npi_is_general | npi_notes
ABW   | Aruba   | 2020-03-21 | Restrictions on internal movement | 2           | NA             | Curfew set (9PM until 6AM). Violations can be met with fines up to AWG 10,000 (over US$5000). Shops need to be closed by 8PM every day. https://www.overheid.aw/actualidad/noticia_47171/item/decisionnan-tuma-pa-gobierno-relaciona-cu-e-crisis-di-coronavirus-covid-19_48506.html

When you go through the data in this format you will spot a set of minor inconsistencies:

  • Most of the time, notes are added only on the event date, but sometimes, like in the example above for Aruba, they are stale. This makes it harder to identify redundant data.
  • Some countries are “initialized” with 0 values for some measures while others are not. I am not sure whether this difference is substantiated by data (most of these cases do not have notes, see below) or whether it is an artifact of data collection.
  • There are quite a few observations with zero measures that are nevertheless classified as ‘general’ or targeted. I am not sure what this implies either.
  • There are missing observations for some countries on recent dates, breaking the general principle that stale but still in-place measures are normally just written forward.
  • Many references in the notes variables are not authoritative even though authoritative sources should exist (more on this below).

Are there any odd cases?

Potentially odd cases could be where measures decrease over time. Let’s do a quick sanity check

oxford_pm_events %>%
  group_by(iso3c, npi_type) %>%
  filter(lead(npi_measure) < npi_measure | lag(npi_measure) > npi_measure) -> df

nrow(df)
## [1] 96
# Example Mexico
df %>%
  filter(iso3c == "MEX") %>%
  kable() %>% kable_styling()
iso3c | country | date       | npi_type                      | npi_measure | npi_is_general | npi_notes
MEX   | Mexico  | 2020-03-14 | Cancel public events          | 1           | 1              | March 14, The Health Secretariat recommends to keep a “healthy distance” and avoid non-essential working, starting on 23 of Ma
MEX   | Mexico  | 2020-03-15 | Cancel public events          | 0           | 0              | NA
MEX   | Mexico  | 2020-02-07 | International travel controls | 3           | NA             | NA
MEX   | Mexico  | 2020-03-18 | International travel controls | 1           | NA             | NA
MEX   | Mexico  | 2020-03-14 | School closing                | 1           | 1              | March 14, the Public Education Secretariat suspends classes from 23 of March until 19 of April. [https://www.gob.mx/salud/pren
MEX   | Mexico  | 2020-03-15 | School closing                | 0           | 0              | NA
MEX   | Mexico  | 2020-03-17 | School closing                | 2           | 0              | Although the national recommendation is to close schools until March 20, as Mexico is a Federation, some states have decided t
MEX   | Mexico  | 2020-03-18 | School closing                | 0           | 1              | NA
MEX   | Mexico  | 2020-03-14 | Workplace closing             | 1           | 1              | March 14, The Health Secretariat recommends to keep a “healthy distance” and avoid non-essential working, starting on 23 of Ma
MEX   | Mexico  | 2020-03-15 | Workplace closing             | 0           | 0              | NA

While many of those cases seem to be supported by notes and are thus likely to be consistent, the Mexican example shows a recurrent pattern: sometimes measures are seemingly “revoked” just one day later with no note supporting the data. This could be an artifact of accidentally mixing level measures with event measures. In addition, the notes appear to be truncated, and they seem to indicate that the measures were meant to be effective on March 23, a fact that is not captured in the data.

Comparing number of interventions and notes coverage with ACAPS data

Because of the above-mentioned inconsistencies in the data, assessing the actual number of coded interventions is non-trivial. I assume that an intervention is defined either by a note that is attached only to a specific date (but not to the date before or after) or by a change in the measure.

oxford_pm_events %>%
  group_by(iso3c, npi_type) %>%
  filter((row_number() == 1) |
           (lag(npi_measure) != npi_measure) |
           (lag(npi_is_general) != npi_is_general) |
           (!is.na(npi_notes) & (lag(npi_notes) != npi_notes))) %>%
  mutate(notes_avail = !is.na(npi_notes)) %>%
  ungroup() -> ope

addmargins(table(ope$npi_type, ope$notes_avail))
##                                    
##                                     FALSE TRUE  Sum
##   Cancel public events                101  144  245
##   Close public transport               83   79  162
##   International travel controls        94  266  360
##   Public information campaigns         85  136  221
##   Restrictions on internal movement    91  138  229
##   School closing                       96  160  256
##   Workplace closing                    93  136  229
##   Sum                                 643 1059 1702
acaps_df <- download_acaps_npi_data(cached = TRUE, silent = TRUE) %>%
  mutate(notes_avail = !is.na(link))

addmargins(table(acaps_df$category, acaps_df$notes_avail))
##                               
##                                FALSE TRUE  Sum
##   Humanitarian exemption           0    2    2
##   Lockdown                         0  102  102
##   Movement restrictions            7  948  955
##   Public health measures           3 1086 1089
##   Social and economic measures     1  520  521
##   Social distancing                6  702  708
##   Sum                             17 3360 3377

The ACAPS data has considerably more interventions (3,377 vs. the 1,702 identified above) and almost full coverage with sources. In the Oxford dataset, currently roughly 60 % of the identified interventions are backed with sources, but this might well be an artifact of my intervention identification approach.

Let’s see how source coverage varies with the measurement magnitude for the Oxford data.

addmargins(table(ope$npi_measure, ope$notes_avail))
##      
##       FALSE TRUE  Sum
##   0     584   35  619
##   1      30  374  404
##   2      26  531  557
##   3       3  116  119
##   Sum   643 1056 1699

This seems to be the case. The “zero measures” only rarely have notes attached. The non-zero measures look much better in terms of coverage. Yet another reason not to use the zero measures.

How does the quality of the notes compare? To get an idea about this, I compare the URLs included in the notes for the Mexican cases.

url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

ope %>% 
  mutate(link = str_extract(npi_notes, url_pattern)) %>% 
  select(iso3c, date, link) %>%
  na.omit() %>%
  arrange(date) -> oxford_urls

oxford_urls %>%
  filter(iso3c == "MEX") %>%
  select(-iso3c) %>%
  kable() %>% kable_styling()
date       | link
2020-02-06 | https://www.gob.
2020-03-12 | https://www.gob.mx/salu
2020-03-14 | https://www.gob.mx/salud/pren
2020-03-16 | https://www.excelsior.com.mx/comunidad/cancelan-eventos-masivos-en-cdmx
2020-03-18 | https://www.gob.mx/salu
2020-03-20 | https://uk.reuters.com/article/uk-health-coronavirus-mexico/mexico-braces-for-coronavirus-lasting-all-year-tightens-curbs-idUKKBN211145
2020-03-21 | https://uk.reuters.com/article/uk-health-coronavirus-mexico/mexico-braces-for-coronavirus-lasting-all-year-tightens-curbs-idUKKBN211145
2020-03-24 | https://uk.reuters.com/article/uk-health-coronavirus-mexico/mexico-braces-for-coronavirus-lasting-all-year-tightens-curbs-idUKKBN211145
2020-03-24 | https://www.lexology.com/library/detail.aspx?g=e01c939c-5cee-45d5-93db-a7c98164e394
bind_rows(
  acaps_df %>% 
    mutate(date = as_date(date_implemented)) %>% 
    select(iso3c, date, link),
  acaps_df %>%  
    select(iso3c, date_implemented, `alternative source`) %>% 
    mutate(date = as_date(date_implemented)) %>% 
    rename(link = `alternative source`) %>%
    select(-date_implemented)
) %>% 
  mutate(link = str_extract(link, url_pattern)) %>% 
  na.omit() %>%
  arrange(date) -> acaps_urls

acaps_urls %>%
  filter(iso3c == "MEX") %>%
  select(-iso3c) %>%
  kable() %>% kable_styling()
date       | link
2020-03-20 | https://mx.usembassy.gov/covid-19-information/
2020-03-21 | https://www.gov.uk/foreign-travel-advice/mexico/coronavirus
2020-03-23 | https://mx.usembassy.gov/covid-19-information/
2020-03-23 | https://mx.usembassy.gov/covid-19-information/
2020-03-26 | https://www.gov.uk/foreign-travel-advice/mexico/coronavirus
2020-03-30 | https://coronavirus.gob.mx/2020/03/30/consejo-de-salubridad-general-declara-emergencia-sanitaria-nacional-a-epidemia-por-coronavirus-covid-19/
2020-03-30 | https://coronavirus.gob.mx/2020/03/30/consejo-de-salubridad-general-declara-emergencia-sanitaria-nacional-a-epidemia-por-coronavirus-covid-19/
2020-03-30 | https://coronavirus.gob.mx/2020/03/30/consejo-de-salubridad-general-declara-emergencia-sanitaria-nacional-a-epidemia-por-coronavirus-covid-19/

You see that some of the Oxford URLs seem truncated and most do not point to governmental resources directly, while the ACAPS URLs all seem to link to authoritative sources. One last check on this: how many URLs return an OK header, meaning that they can be reached, although not necessarily that they will still return the required data? I test this on a sample of 100 URLs from both sources.

return_pct_valid_urls <- function(df, n = 100) {
  urls <- df %>% sample_n(n) %>% pull(link)
  works <- sapply(urls, url.exists)
  sum(works)/n
}

return_pct_valid_urls(oxford_urls, 100)
## [1] 0.81
return_pct_valid_urls(acaps_urls, 100)
## [1] 0.94

It appears that the source URLs provided by ACAPS are in better shape. Time to compare the two data sources in terms of actual measures. Let’s first look at the coverage across countries.

acaps <- download_acaps_npi_data(cached = TRUE, silent = TRUE)

acaps %>% select(iso3c) %>% unique() %>% nrow()
## [1] 182
raw_data %>% select(iso3c) %>% unique() %>% nrow()
## [1] 190
oxford_pm_events %>% filter(npi_measure > 0) %>% select(iso3c) %>% unique() %>% nrow()
## [1] 90

The ACAPS data covers a much wider array of countries but the Oxford data also spans an impressive list of countries. While their raw data file contains 190 country identifiers it seems to contain actual data currently for 90 countries. In their documentation, the team states that they have collected data for 77 countries but that they plan to enlarge their sample.

To compare the intervention measures themselves, as the categories are not directly comparable, I compare a ranked measure based on the relevant ACAPS measures with the Stringency Index of the Oxford data.

download_merged_data(cached = TRUE, silent = TRUE) %>%
  mutate(acaps_score = 100*((soc_dist/max(soc_dist, na.rm = TRUE) +
           mov_rest/max(mov_rest, na.rm = TRUE) + lockdown)/3)) %>%
  mutate(acaps_score = 100*percent_rank(acaps_score)) %>% 
  left_join(raw_data %>%
              rename(oxford_si = StringencyIndex) %>%
              select(iso3c, date, oxford_si),
            by = c("iso3c", "date")) %>%
  select(iso3c, date, acaps_score, oxford_si) -> df

summary(df)
##     iso3c                date             acaps_score      oxford_si     
##  Length:12994       Min.   :2020-01-22   Min.   : 0.00   Min.   :  0.00  
##  Class :character   1st Qu.:2020-02-09   1st Qu.: 0.00   1st Qu.:  0.00  
##  Mode  :character   Median :2020-02-27   Median : 0.00   Median : 14.00  
##                     Mean   :2020-02-27   Mean   :30.37   Mean   : 24.35  
##                     3rd Qu.:2020-03-16   3rd Qu.:74.51   3rd Qu.: 38.00  
##                     Max.   :2020-04-03   Max.   :99.98   Max.   :100.00  
##                                                          NA's   :7639
df %>% 
  pivot_longer(3:4, names_to = "source", values_to = "measure") %>%
  filter(!is.na(measure)) %>%
  group_by(date, source) %>%
  summarize(
    mn = mean(measure),
    se = sd(measure)/sqrt(n())
  ) %>%
  ggplot(aes(x = date, y = mn, color = source)) +
  geom_pointrange(
    aes(ymin = mn - 1.96*se, ymax = mn + 1.96*se),
    position = position_dodge(0.4)
  )

df %>%
  filter(!is.na(oxford_si) & !is.na(acaps_score)) %>%
  group_by(iso3c) %>%
  summarise(oxford_si = mean(oxford_si),
            acaps_score = mean(acaps_score)) %>%
  ggplot(aes(x = oxford_si, y = acaps_score)) + geom_point() +
  gghighlight(abs(oxford_si - acaps_score) > 30, label_key = iso3c)

The two measures are clearly correlated but it also becomes apparent that the country-level averages vary significantly. Thus, it seems likely that the choice of the data source might have an impact on research findings.

Replicating the Oxford Government Response Stringency Index

The team of the Oxford Blavatnik School has constructed an aggregate “stringency” measure. Many people will be tempted to use this measure as an overall indicator for the country-level intensity of interventions. Thus, I try to reproduce this measure to assess its internal validity.

From the working paper documenting the dataset:

Our baseline measure of variation in governments’ responses is the COVID-19 Government Response Stringency Index (Stringency Index). For each ordinal policy response measure S1-S7, we create a score by taking the ordinal value and adding one if the policy is general rather than targeted, if applicable. This creates a score between 0 and 2 for S5, and 0 and 3 for the other six responses. We then rescale each of these by their maximum value to create a score between 0 and 100, with a missing value contributing 0. These seven scores are then averaged to get the composite Stringency Index.
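To make the arithmetic concrete, here is a toy illustration of that scoring rule in R. All values below are made up for illustration and are not taken from the dataset.

# Hypothetical country-day: ordinal values for S1-S7 plus the "is general" flag,
# with missing values already set to 0 as described in the quote above.
measure    <- c(2, 1, 0, 2, 1, 0, 1)  # made-up S1-S7 ordinal values
is_general <- c(1, 0, 0, 1, 1, 0, 0)  # made-up general (1) vs. targeted (0) flags
max_score  <- c(3, 3, 3, 3, 2, 3, 3)  # S5 is scored 0-2, the other six responses 0-3

# Rescale each response to 0-100 and average to get the composite index
round(mean(100 * (measure + is_general) / max_score))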

I implement this approach using the original data

si <- oxford_pm %>%
  group_by(iso3c, date) %>%
  summarise(delete = all(is.na(npi_measure)) & all(is.na(npi_is_general))) %>%
  left_join(oxford_pm, by = c("iso3c", "date")) %>%
  filter(!delete) %>%
  select(-delete) %>%
  mutate(
    npi_measure = replace_na(npi_measure, 0),
    npi_is_general = replace_na(npi_is_general, 0)
  ) %>%
  group_by(npi_type) %>%
  mutate(score = (npi_measure + npi_is_general)/max(npi_measure + npi_is_general)) %>%
  group_by(iso3c, date) %>%
  summarise(si_100 = round(100*mean(score)))

df <- raw_data %>% select(iso3c, date, StringencyIndex) %>%
  left_join(si, by = c("iso3c", "date"))

summary(df)
##     iso3c                date            StringencyIndex      si_100      
##  Length:10561       Min.   :2020-01-01   Min.   :  0.00   Min.   :  0.00  
##  Class :character   1st Qu.:2020-01-26   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Median :2020-02-20   Median :  5.00   Median : 10.00  
##                     Mean   :2020-02-18   Mean   : 19.59   Mean   : 20.68  
##                     3rd Qu.:2020-03-15   3rd Qu.: 29.00   3rd Qu.: 29.00  
##                     Max.   :2020-03-31   Max.   :100.00   Max.   :100.00  
##                                          NA's   :3280     NA's   :3046
ggplot(df, aes(x = StringencyIndex, y = si_100)) +  geom_point(alpha = 0.2) + theme_minimal()
## Warning: Removed 3280 rows containing missing values (geom_point).

Not all observations have identical values. There is a substantial amount of data where my reproduced measure has higher values compared to the measure reported by the Oxford team. After inspecting the data I got the impression that the Oxford team does not add the ‘is_general’ value when the ‘measure’ value for a certain intervention is zero. Testing this conjecture yields the following.

si <- oxford_pm %>%
  group_by(iso3c, date) %>%
  summarise(delete = all(is.na(npi_measure)) & all(is.na(npi_is_general))) %>%
  left_join(oxford_pm, by = c("iso3c", "date")) %>%
  filter(!delete) %>%
  select(-delete) %>%
  mutate(
    npi_measure = replace_na(npi_measure, 0),
    npi_is_general = replace_na(npi_is_general, 0)
  ) %>%
  group_by(npi_type) %>%
  mutate(score = ifelse(npi_measure > 0, 
                        npi_measure + npi_is_general, 
                        npi_measure)/max(npi_measure + npi_is_general)) %>%
  group_by(iso3c, date) %>%
  summarise(si_100 = round(100*mean(score)))

df <- raw_data %>% select(iso3c, date, StringencyIndex) %>%
  left_join(si, by = c("iso3c", "date"))

summary(df)
##     iso3c                date            StringencyIndex      si_100      
##  Length:10561       Min.   :2020-01-01   Min.   :  0.00   Min.   :  0.00  
##  Class :character   1st Qu.:2020-01-26   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Median :2020-02-20   Median :  5.00   Median :  5.00  
##                     Mean   :2020-02-18   Mean   : 19.59   Mean   : 19.43  
##                     3rd Qu.:2020-03-15   3rd Qu.: 29.00   3rd Qu.: 29.00  
##                     Max.   :2020-03-31   Max.   :100.00   Max.   :100.00  
##                                          NA's   :3280     NA's   :3046
ggplot(df, aes(x = StringencyIndex, y = si_100)) +  geom_point(alpha = 0.2) + theme_minimal() 
## Warning: Removed 3280 rows containing missing values (geom_point).

Now that works. As zero measures lead to the exclusion of both variables (‘measure’ and ‘is_general’) from the aggregated score the reliability of the zero measures seems even more questionable.

A quick look at the financial measures

The Oxford dataset also contains some financial measures. Let’s see.

df <- raw_data %>%
  rename(
    fisc_measures = `S8_Fiscal measures`,
    mon_measures = `S9_Monetary measures`,
    inv_health_care = `S10_Emergency investment in health care`,
    inv_vaccines = `S11_Investment in Vaccines`
  ) %>%
  select(iso3c, date, fisc_measures, mon_measures, inv_health_care, inv_vaccines)

summary(df)
##     iso3c                date            fisc_measures        mon_measures   
##  Length:10561       Min.   :2020-01-01   Min.   :0.000e+00   Min.   :-0.750  
##  Class :character   1st Qu.:2020-01-26   1st Qu.:0.000e+00   1st Qu.: 0.000  
##  Mode  :character   Median :2020-02-20   Median :0.000e+00   Median : 0.750  
##                     Mean   :2020-02-18   Mean   :2.496e+09   Mean   : 2.549  
##                     3rd Qu.:2020-03-15   3rd Qu.:0.000e+00   3rd Qu.: 3.000  
##                     Max.   :2020-03-31   Max.   :2.050e+12   Max.   :55.000  
##                                          NA's   :4384        NA's   :4935    
##  inv_health_care     inv_vaccines      
##  Min.   :0.00e+00   Min.   :        0  
##  1st Qu.:0.00e+00   1st Qu.:        0  
##  Median :0.00e+00   Median :        0  
##  Mean   :1.86e+08   Mean   :   969677  
##  3rd Qu.:0.00e+00   3rd Qu.:        0  
##  Max.   :1.50e+11   Max.   :286175609  
##  NA's   :5068       NA's   :5158

A lot of zeros. Again, I am uncertain what separates missing values from zero. The ‘mon_measures’ variable captures the ‘Value of interest rate’ (economist cringes). From the notes I get the impression that mostly central bank interest rates have been collected, on an arbitrary basis (the value of 55 % is actually OK; it’s from Argentina). As an economist I would not use that data but rather turn to specialized data sources, like, e.g., data provided by the International Monetary Fund.

The budgetary information is potentially more interesting. Unfortunately, however, it appears to be inconsistently collected. First, there are implausibly small values present in the data. Given that the data (besides the monetary measures) are denominated in US-$, these are most likely data errors (in particular the 1 US-$ values, which appear to be mis-coded ordinal data).

df %>% filter(fisc_measures < 1e6 & fisc_measures > 0 |
                inv_health_care < 1e6 & inv_health_care > 0 |
                inv_vaccines < 1e6 & inv_vaccines > 0) %>%
  select(-mon_measures)
## # A tibble: 61 x 5
##    iso3c date       fisc_measures inv_health_care inv_vaccines
##    <chr> <date>             <dbl>           <dbl>        <dbl>
##  1 BRB   2020-03-14            0                1            0
##  2 CHL   2020-01-16            0           304204           NA
##  3 DOM   2020-03-17            1               NA            0
##  4 DOM   2020-03-18            1               NA           NA
##  5 ESP   2020-01-31            0                0       246961
##  6 FIN   2020-03-19       536507               NA           NA
##  7 IRL   2020-03-29          311.             585            0
##  8 IRQ   2020-02-25           NA           420168           NA
##  9 ISR   2020-02-02            0                0            1
## 10 ISR   2020-02-03            0                0            1
## # … with 51 more rows

More importantly, it seems as if part of the data is being coded as events, whereas other parts of data are coded as levels (with values being positive and stable over time). Compare, as an example, Canada and Germany.

df %>% filter(iso3c == "DEU" | iso3c == "CAN", date > ymd("2020-03-01")) %>%  ggplot(aes(x = date, y = fisc_measures, color = iso3c)) + geom_line() + geom_point()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

Summary

I applaud the Oxford team for crowd-sourcing such an impressive dataset in such a short period of time. However, given the current status of the data, I cannot advise to use the financial measurement data.

The main data, the policy measures, seem to be in better shape. Nevertheless, these items also do not come without issues. The organization in wide format creates redundant data and both introduces and conceals potential coding errors. The distinction between zero and missing values is unclear. Later days in March sometimes have missing values. The calculation of the Stringency Index is not described in sufficient detail to warrant effortless reproduction. While policy measures are generally coded as levels, it appears that in some cases they are coded as interventions instead. The notes to the policy measures could be more authoritative.

Compared to the Oxford data, the ACAPS data spans more countries, has more observations, finer categories, provides also some information on the regional structure of interventions, comes in a tidier format and has more authoritative sources included. Comparing the measures provided by both data sources shows that, while both exhibit clearly similar patterns, country-level averages vary considerably. This implies that the choice of the data source might have an impact on research findings.

My hope is that this review is helpful in improving the integrity of these important data sources as high quality data on non-pharmaceutical interventions will be instrumental to assess their effects going forward.


To leave a comment for the author, please follow the link and comment on their blog: An Accounting and Data Science Nerd's Corner.


Introductory videos for Explanatory Model Analysis with R


[This article was first published on Stories by Przemyslaw Biecek on Medium, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

Remote teaching at my university encouraged me to prepare some video materials for Explanatory Model Analysis techniques, i.e. techniques of exploration, explanation and visualisation of predictive models.

The pyramid for Explanatory Model Analysis. Left part is focused on a single observation (instance-level or local explanations). Right part is focused on the whole data set (called dataset level or global explanations). In the first row are raw model predictions and model statistics. The second row is related to the analysis of model parts. The third row describes the profile of model behaviour depending on the values of individual variables. The fourth row corresponds to the residual diagnostics.

For example, for a model that estimates odds of default in credit scoring, the model level analysis focuses on explaining the model’s behaviour for a selected population of customers.

The ebook describing these techniques was already available at https://pbiecek.github.io/ema/. Today I have published a youtube playlist with short videos showing how to use these techniques in R.

For now, the first 5 videos are available for instance level analysis. These videos are focused on the DALEX package for R. You can read about other R packages for XAI and more about the methodology in the ebook.

Here are these videos

  1. Gentle introduction to the topic of exploration, why this topic is important and how it fits into the model building process.

<a href="https://medium.com/media/732d498927bf677496bd7d7aa0bfb9f0/href" rel="nofollow" target="_blank">https://medium.com/media/732d498927bf677496bd7d7aa0bfb9f0/href</a>

2. Introduction to the DALEX package and other tools from the DrWhy.AI family (e.g. modelDown and modelStudio)

<a href="https://medium.com/media/ac7c2af69f52d6890d1337391f8664cf/href" rel="nofollow" target="_blank">https://medium.com/media/ac7c2af69f52d6890d1337391f8664cf/href</a>

3. Instance level attribution with break-down method for determining variables important for a prediction. The break-down method decomposes predictions of any model into parts that can be attributed to model variables.

<a href="https://medium.com/media/57950da4f2f64929179b880df76b3b3d/href" rel="nofollow" target="_blank">https://medium.com/media/57950da4f2f64929179b880df76b3b3d/href</a>

4. Instance level attribution with Shapley values and break-down with interactions. Both extend the possibilities of the break-down method.

<a href="https://medium.com/media/81f7c07a3e30f371a052fc8f9247a92d/href" rel="nofollow" target="_blank">https://medium.com/media/81f7c07a3e30f371a052fc8f9247a92d/href</a>

5. Presentation of the ceteris paribus method for analysing the model’s response profile for selected variables at the indicated observation.

<a href="https://medium.com/media/b321ec84dbf01a4f7262a91c67a032e7/href" rel="nofollow" target="_blank">https://medium.com/media/b321ec84dbf01a4f7262a91c67a032e7/href</a>

The whole playlist is at https://www.youtube.com/playlist?list=PLGzKiXahhU63NmyM7sALiVtFYTUYEpTVm

Supplementary materials are at http://dalex.drwhy.ai/.

Now we’re working on videos for the python version of this package. Stay tuned!

Questions or comments? Let me know at ema at drwhy.ai.

Warm thanks to everyone who helped me to prepare these videos, especially Wojciech Kretowicz, Hubert Baniecki, Alicja Gosiewska, Anna Kozak and Katarzyna Woźnica.


To leave a comment for the author, please follow the link and comment on their blog: Stories by Przemyslaw Biecek on Medium.


C is for coalesce


[This article was first published on Deeply Trivial, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

For the letter C, we’ll talk about the coalesce function. If you’re familiar with SQL, you may have seen this function before. It combines two or more variables into a single column, and is a way to deal with missing data. When you give it a list of variables, it selects the first non-missing value it finds. Because of that, order of entry of those variables is important.
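A quick toy example (vectors invented for illustration) shows how the order of the arguments determines which value wins:

library(dplyr)

x <- c(NA, 2, NA)
y <- c(1, 5, NA)

coalesce(x, y)  # returns 1 2 NA: x is checked first, y only fills in where x is missing
coalesce(y, x)  # returns 1 5 NA: reversing the order changes the second element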

I recently used this function at work to get last performance on an item. That is, diplomates in our program take spaced assessments, with repetition of items (same content, but slightly different item wording) that they answered incorrectly. If they got the initial item correct, they may never see a related item again (although over time they will), but if they were incorrect, they may get related items once or more times to see if they’re learning. So we have some people who will have a missing value for the repeated item and others who will have correct/incorrect. The coalesce function allowed me to get their final performance on item content by telling it to first check the repeated item column and then, if that’s missing, to look at the initial item column.

How can we use the coalesce function for the reading dataset? I noticed that for 17 books in the dataset, I forgot to supply a rating, so they’re showing up as 0. Let’s select those titles, and I can figure out how I’d like to rate them.

library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 -- 
##  ggplot2 3.2.1      purrr   0.3.3 
##  tibble  2.1.3      dplyr   0.8.3 
##  tidyr   1.0.0      stringr 1.4.0 
##  readr   1.3.1      forcats 0.4.0 
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
reads2019 <- read_csv("~/Downloads/Blogging A to Z/SarasReads2019.csv", col_names = TRUE)
## Parsed with column specification:
## cols(
##   Title = col_character(),
##   Pages = col_double(),
##   date_started = col_character(),
##   date_read = col_character(),
##   Book.ID = col_double(),
##   Author = col_character(),
##   AdditionalAuthors = col_character(),
##   AverageRating = col_double(),
##   OriginalPublicationYear = col_double(),
##   read_time = col_double(),
##   MyRating = col_double(),
##   Gender = col_double(),
##   Fiction = col_double(),
##   Childrens = col_double(),
##   Fantasy = col_double(),
##   SciFi = col_double(),
##   Mystery = col_double(),
##   SelfHelp = col_double()
## )
unrated <- reads2019 %>% filter(MyRating == 0)
list(unrated$Title)
## [[1]]
##  [1] "Bird Box (Bird Box, #1)"                             
##  [2] "Elevation"                                           
##  [3] "Glinda of Oz (Oz, #14)"                              
##  [4] "It"                                                  
##  [5] "Just Evil (Evil Secrets Trilogy, #1)"                
##  [6] "Oryx and Crake (MaddAddam, #1)"                      
##  [7] "Precarious Pasta (Cozy Corgi Mysteries, #14)"        
##  [8] "Redshirts"                                           
##  [9] "Rinkitink in Oz (Oz, #10)"                           
## [10] "Summerdale"                                          
## [11] "The 5 Love Languages: The Secret to Love That Lasts" 
## [12] "The Long Mars (The Long Earth, #3)"                  
## [13] "The Magic of Oz (Oz, #13)"                           
## [14] "The Marvelous Land of Oz (Oz, #2)"                   
## [15] "The Scarecrow of Oz (Oz, #9)"                        
## [16] "The Tin Woodman of Oz (Oz, #12)"                     
## [17] "Tik-Tok of Oz (Oz, #8)"

So now I want to generate a variable with my new ratings, and merge it into my reads dataset.

unrated$NewRating <- c(3, 5, 4, 4, 2, 4, 4, 5, 4, 3, 3, 4, 4, 5, 3, 3, 4)
unrated <- unrated %>% select(Book.ID, NewRating)
reads2019 <- reads2019 %>% left_join(unrated, by = "Book.ID")

Finally, we’ll create our final ratings variable by coalescing the two ratings variables. I want it to first look at the new rating column, so it can capture those updates. For rows missing that variable (because I already rated them), it will look next at the original rating variable.

reads2019 <- reads2019 %>% mutate(FinalRating = coalesce(NewRating, MyRating))
table(reads2019$FinalRating)
## 
##  2  3  4  5 
##  2  9 51 25 

No more 0s!


To leave a comment for the author, please follow the link and comment on their blog: Deeply Trivial.


COVID-19 shiny / plotly dashboard


[This article was first published on Sebastian Engel-Wolf blog, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

Governments and COVID-19: which government stops it faster, handles it better, has fewer people dying? These questions get answered with my dashboard.

A contribution to the shiny-contest: https://community.rstudio.com/t/material-design-corona-covid-19-dashboard-2020-shiny-contest-submission/59690

Intro

How did Corona spread? Using the animation feature of R-shiny this can be easily tracked.

COVID-19 is the major topic in all news channels. The place I live in is Munich, Germany. Within weeks, Germany moved from 3 patients in the hospital next to my home to having 20,000 patients. As a data scientist, I did not only see the numbers but also the exponential growth. I wanted to know:

  • How is the German government performing?
  • How do other countries stop the disease from spreading?
  • How long does it take for the disease to spread?
  • For how long is there exponential growth?
  • How many people do actually die?

To enable this, I got going pretty fast using shiny. With shiny you can select countries and date ranges and make flexible tables with datatable. Great! Additionally, I used plotly to zoom into all plots, get better legends, and make it easy to browse through my data. What else… shinymaterial makes the whole app look nice. It’s a great package and comes with easy use on mobile devices. I guess that’s it. Now I can answer all my questions by browsing through the app. It’s easy to see how well South Korea managed Corona, for example. You can also see how long it took for people to die in German hospitals, while the outbreak was rather fast in Italy. Moreover, the app shows that in the US, up to now (Apr 3rd), the spread has not really been stopped.

Go to the app to see how your country performs:

If all this Corona data is too much for you, you can also check out the fun data section inside the app.

Implementation

I used the following packages to build the app:

All code of this App is hosted on github:

To clean the data I mainly wrote a script which does the following:

  • Clean the regions for the CSSE data depending on the date (the encoding was changed 3x in 3 weeks)
  • Aggregate data per country
  • Merge the data sets for confirmed, deaths and recovered cases to also compute the active cases (a minimal sketch of this step follows below)
  • Aggregate per date to visualize data on the map

All this code can be found in data_gen.R
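The actual cleaning lives in data_gen.R in the repository; as a rough, hypothetical sketch of the merge step mentioned above (column names and numbers are made up and may differ from the real script):

library(dplyr)

# Toy stand-ins for the per-country daily CSSE series (names and values invented)
confirmed <- data.frame(country = "Germany", date = as.Date("2020-03-31"), confirmed = 71808)
deaths    <- data.frame(country = "Germany", date = as.Date("2020-03-31"), deaths    = 775)
recovered <- data.frame(country = "Germany", date = as.Date("2020-03-31"), recovered = 16100)

# Merge the three series and derive the active cases from them
covid <- confirmed %>%
  left_join(deaths,    by = c("country", "date")) %>%
  left_join(recovered, by = c("country", "date")) %>%
  mutate(active = confirmed - deaths - recovered)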

To build up the app I used shiny modules. I have explained how to build modular shiny apps several times already: App – from Truck and Trailer. This time I used standard shiny modules without classes. Each of the pages shown inside the app is such a module: one for the map, one for the timeline charts, one for Italy….
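As a rough sketch of that pattern (all names below are invented, not taken from the app, and the real modules render plotly output rather than base plots):

library(shiny)

# UI part of a hypothetical page module
timeline_ui <- function(id) {
  ns <- NS(id)
  tagList(
    selectInput(ns("country"), "Country", choices = c("Germany", "Italy", "South Korea")),
    plotOutput(ns("chart"))
  )
}

# Server part; `data` would be the cleaned case data passed in from the main server
timeline_server <- function(input, output, session, data) {
  output$chart <- renderPlot({
    plot(data[data$country == input$country, c("date", "confirmed")], type = "l")
  })
}

# In the main app: timeline_ui("timeline") in the UI and
# callModule(timeline_server, "timeline", data = covid_data) in the server.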

To render the plots I only used plotly. Plotly allows the user to select certain lines, zoom into the plot and move around. With a few lines of code it is possible to create a line chart which can be grouped and colored per group:

plot_ly() %>%
  add_trace(
    data = simple_data,
    x = ~as.numeric(running_day),
    y = ~as.numeric(active),
    name = country_name,
    text = "",
    type = if (type == "lines") NULL else type,
    line = list(color = palette_col[which(unique(plot_data_intern2$country) == country_name)])
  )

The result looks like this:

An important feature I wanted to build in was a table, where a lot of measurements per country are available. I set up these measurements:

  • Maximum time of exponential growth in a row: The number of days a country showed exponential growth (doubling of infections in a short time) in a row. This means there was no phase of slow growth or decrease in between.
  • Days to double infections: This gives the time it took until today to double the number of infections. A higher number is better, because it takes longer to infect more people (see the sketch after this list).
  • Exponential growth today: Whether the country's number of infections is still growing exponentially
  • Confirmed cases: Confirmed cases today according to the Johns Hopkins CSSE data set
  • Deaths: Summed-up deaths until today according to the Johns Hopkins CSSE data set
  • Population: Number of people living inside the country
  • Confirmed cases per 100,000 inhabitants: How many people have been infected if you randomly chose 100,000 people from this country
  • Mortality rate: Percentage of deaths per confirmed case
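Below is a minimal sketch of how a "days to double infections" figure can be computed from a cumulative case series; the series is made up and the exact definition used in the app may differ, so treat this as an illustration only.

# Toy cumulative confirmed cases for one country (invented numbers)
confirmed <- c(10, 14, 20, 29, 40, 55, 72, 90, 105, 118)

# Days to double: how many days ago were cases at most half of today's count?
today <- length(confirmed)
days_to_double <- today - max(which(confirmed <= confirmed[today] / 2))
days_to_double  # 4 days with this toy series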

With the datatable package this table is scrollable and searchable. Even on mobile devices:

Last but not least, I wanted to have a map that changes over time. This was enabled using the leaflet package. leafletProxy makes it possible to add new circles every time the data_for_display changes. The code for the map looks like this:

leafletProxy(mapId = "outmap") %>%
  clearGroup(curr_date()) %>%
  addCircles(data = data_for_display,
             lng = ~Long, lat = ~Lat,
             radius = ~active_scaled,
             popup = ~text,
             fillColor = ~color, stroke = FALSE, fillOpacity = 0.5,
             group = stringr::str_match(date_to_choose, "\\d{4}\\-\\d{2}\\-\\d{2}")[1,1]
  )

With shiny, the date-slider could easily be animated

shiny::sliderInput(inputId = session$ns("datum"),
                   min = as.POSIXct("2020-02-01"),
                   max = max(all_dates()),
                   value = max(all_dates()),
                   step = 86400,
                   label = "Date", timeFormat = "%Y-%m-%d",
                   animate = animationOptions(interval = 200))

The result is the video from above:

Links


To leave a comment for the author, please follow the link and comment on their blog: Sebastian Engel-Wolf blog.


Why R? Webinar on R/exams for E-Learning Quizzes and Beyond


[This article was first published on R/exams, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

Webinar hosted by Why R? Foundation: Slides, video, and e-learning resources.

Why R? Webinar on R/exams for E-Learning Quizzes and Beyond

Why R? Webinar 001

R/exams was presented in the first Why R? webinar by Achim Zeileis. Marcin Kosiński was the host and did a great job setting up the Zoom session that was streamed live on YouTube where more than 100 participants joined the live chat.

Compared to previous presentations on R/exams, the webinar gave some emphasis to e-learning. Also, the presentation shared some experiences from Universität Innsbruck on how R/exams can assist the transition to distance learning due to the corona crisis in spring 2020.

This blog post provides various resources for those who did not attend the webinar or who want to explore the materials in some more detail. Most importantly the presentation slides are available in PDF format (under CC-BY):

slides.pdf

Video

Thanks to the great job done by the Why R? organizers, the recording of the live stream is available on YouTube:

YouTube

E-Learning

To illustrate the e-learning capabilities supported by R/exams, the presentation went through an online quiz generated by R/exams and imported into OpenOLAT, an open-source learning management system (available under the Apache License). The online test is made available again here for anonymous guest access. (Note however, that the system only has one guest user so that when you start the test there may already be some test results from a previous guest session. In that case you can finish the test and also start it again.)

OpenOLAT

A detailed description of the generation and import of this online quiz is available in the recent R/exams e-learning tutorial.

Getting started

Further resources for getting started with R/exams are available in these tutorials:


To leave a comment for the author, please follow the link and comment on their blog: R/exams.


Another “flatten the COVID-19 curve” simulation… in R


[This article was first published on long time ago..., and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

 Hi there,

This is the best meme I’ve found during these days…

Well, here is my “BUT” contribution. Some weeks ago The Washington Post published these simulations about how “social distancing” could help to “flatten the curve” of COVID-19 infections. I fell in love with these simulations because of their simplicity and explanatory power. Indeed, you can use pretty similar principles to simulate predator hunting behavior and other movement patterns… I wrote some R code to have a tiny version of them…

library(raster)
library(dismo)

r <- raster(nrows = 100, ncols = 100, xmn = 0, xmx = 100, ymn = 0, ymx = 100)
r[] <- 0

steps <- 500
n <- 100

locations <- data.frame(matrix(ncol = n, nrow = steps))

pp <- randomPoints(r, n)
cell <- cellFromXY(r, pp)
infected <- 1:10
pob <- 1:n
ratio <- data.frame(1:steps, NA)
pob_inf <- list()

for (j in 1:steps){
  print(j)
  locations[j,] <- cell
  
  for(i in 1:n){
    cell[i] <- adjacent(r, cell[i], 8)[round(runif(1, 0.51, nrow(adjacent(r, cell[i], 8)) + 0.49), 0), 2]
  }
  
  new_inf <- cell %in% cell[infected]
  infected <- pob[new_inf]
  ratio[j, 2] <- length(infected)
  pob_inf[[j]] <- infected
}

locations2 <- data.frame(matrix(ncol = n, nrow = steps))

cell2 <- cellFromXY(r, pp)
infected2 <- 1:10
pob2 <- 1:n
ratio2 <- data.frame(1:steps, NA)
pob_inf2 <- list()

for (j in 1:steps){
  print(j)
  locations2[j,] <- cell2
  
  for(i in 1:25){
    cell2[i] <- adjacent(r, cell2[i], 8)[round(runif(1, 0.51, nrow(adjacent(r, cell2[i], 8)) + 0.49), 0), 2]
  }
  
  new_inf2 <- cell2 %in% cell2[infected2]
  infected2 <- pob2[new_inf2]
  ratio2[j, 2] <- length(infected2)
  pob_inf2[[j]] <- infected2
}

Let’s make some plots to put them together in a GIF and better visualize the results…

num <- seq(1, 500, 4)

for (p in 1:125){
  id <- sprintf("%03d", num[p])
  png(paste("corona_", id, ".png", sep = ""), width = 780, height = 800, units = "px", pointsize = 15)
  layout(matrix(c(1, 1, 2, 3), 2, 2, byrow = TRUE))
  plot(ratio[1:num[p],], pch = 19, xlim = c(0, 500), ylim = c(0, 100), ylab = "nº of infected",
       xlab = "time", col = "red", cex.axis = 1.4, cex.lab = 1.4, main = "Infected curves", cex.main = 2)
  points(ratio2[1:num[p],], pch = 19, col = "blue")
  legend("topright", legend = c("free movement", "restricted movement"), lwd = c(4, 4), col = c("red", "blue"), cex = 1.5)
  plot(r, col = "white", legend = FALSE, axes = FALSE, main = "free movement", cex.main = 1.8)
  points(xyFromCell(r, as.numeric(locations[num[p],])), cex = 1.2, col = "grey40", pch = 18)
  points(xyFromCell(r, as.numeric(locations[num[p],]))[pob_inf[[num[p]]],], pch = 18, col = "red", cex = 1.4)
  plot(r, col = "white", legend = FALSE, axes = FALSE, main = "restricted movement", cex.main = 1.8)
  points(xyFromCell(r, as.numeric(locations2[num[p],])), cex = 1.2, col = "grey40", pch = 18)
  points(xyFromCell(r, as.numeric(locations2[num[p],]))[pob_inf2[[num[p]]],], pch = 18, col = "blue", cex = 1.4)
  
  dev.off()
}

Done!

 Stay safe!


To leave a comment for the author, please follow the link and comment on their blog: long time ago....


GARCHery


[This article was first published on R on OSM, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

In our last post, we discussed using the historical average return as one method for setting capital market expectations prior to constructing a satisfactory portfolio. We glossed over setting expectations for future volatility, mainly because it is such a thorny issue. However, we read an excellent tutorial on GARCH models that inspired us at least to take a stab at it. The tutorial hails from the work of Marcelo S. Perlin and colleagues. We highly recommend Mr. Perlin’s blog and note that we’ve also used his simfinR package in our series on valuation, which can be found here, here, and here.

In portfolio construction, estimating expected volatility is as important, if not more so, as estimating expected returns. Risk, as we discussed in Portfolio starter kit and SHARPEn your portfolio, and as quantified by volatility, has a profound impact on allocating assets to construct a satisfactory portfolio. Hence, getting your volatility expectations well-grounded should improve portfolio allocation decisions.

Yet, how do you assign expected volatility? Using the historical, annualized standard deviation of returns is a frequent method, just like using the historical mean return. But the problem is that volatility has different characteristics than returns. One need only look at a long-run graph of the S&P 500’s annualized volatility to see that.

Volatility is, well, volatile. It’s clearly not constant and features periods of relative calm interspersed with gut-wrenching spikes, as anyone who’s been following the stock market recently is well aware. What this suggests is that the aggregating period you use for expected returns and risk should match your investment horizon. If you’re investing for 10 years, you shouldn’t expect to achieve the same results as a 50 year period because volatility can mess with your returns and your head. That is, any period shorter than the long-run will likely see different average returns and volatility, sometimes significantly so, due to the volatility of volatility. Just compare the average return of the 1980-1990 period to that of 2000-2010 vs. the long-term trend in the graph below.

Here’s the rub. As we showed in Mean expectations, there are problems using average returns due to the distribution of those returns and which period is more relevant. These problems apply to volatility, but with the additional effect of volatility clustering. That is, periods of high volatility tend to be followed by high volatility, and vice versa. When those volatility regimes switch is unpredictable, which means you need a sufficiently long horizon to allow returns and volatility to revert to their long-run level. You may enjoy a period of relative calm, only to see a huge spike at the end and suffer far worse returns than expected. Or, it could be the complete opposite. This unpredictability is best seen by comparing two different periods. For example, the period prior to 2000 had very few spikes in volatility above 25% even with Black Monday in 1987. The period after had way more. In fact, as we show in the table below, the period after 2000 saw the frequency of volatility spikes increase more than four-fold vs. the period before.

Table 1: Frequency of volatility spikes above 25%
Regime    | Frequency (%)
Post-2000 | 13.3
Pre-2000  | 3.0

What that implies is, even if your average return estimate is relatively accurate, if volatility is under-estimated, then your end of period return could be significantly different (likely worse, though better is possible too) due to compounding and path dependency. How does one adjust for this with historical data?

Bootstrapping, which we introduced in the previous post, is one way to solve this problem. In this case, your sample size would be the same as your investment horizon. If your horizon is 10 years, you’d repeatedly sample 10-year periods out of the entire series and then average across those samples.1 This approach might not capture volatility clustering, however. To do that, you could sample in blocks as we did in this post. While this would likely capture the clustering, deciding on block length would be arbitrary without using more sophisticated methods to choose appropriate correlation lags.
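As a minimal sketch of that horizon-matched bootstrap (the return series below is simulated purely for illustration; with real data you would plug in your historical monthly returns):

# Horizon-matched bootstrap: sample as many monthly returns as the horizon has months
set.seed(123)
monthly_ret <- rnorm(12 * 50, mean = 0.007, sd = 0.04)  # 50 years of fake monthly returns

horizon <- 120  # 10-year horizon in months
samples <- replicate(1000, sample(monthly_ret, horizon, replace = TRUE))

# Annualized mean return and volatility across the bootstrapped 10-year samples
mean(colMeans(samples)) * 12
mean(apply(samples, 2, sd)) * sqrt(12)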

A third way is to simulate future returns. Enter the GARCH model, which provides a solid framework to approximate future returns and volatility while accounting for some of the issues we just highlighted.2 Note we’ve glossed over giving examples of many of the issues around using historical volatility as a proxy for expected future volatility to focus on GARCH model results. Unfortunately, we also won’t do justice to the model in this post, so please read Marcelo Perlin’s blog, for further details.

We’ll now briefly attempt to explain the GARCH model and simulate returns to outline simulation as an approach to setting volatility expectations. The GARCH model attempts to identify a long-run volatility level while accounting for volatility clustering. That is, it estimates an underlying volatility level, but allows for scenarios in which moves larger than the long-run level tend to be followed by more large moves. This is similar to what one sees historically, as shown in the graph below. Please note this graph along with many of the following charts, analyses, and underlying code were adapted from the code used in the GARCH tutorial and can be found on GitHub. Any errors are ours, however.

As evident in the chart above, large moves in the S&P tend to cluster around major events—Black Monday in 1987, the global financial crisis, and the covid-19 pandemic, most notably. The GARCH model thus attempts to account for mean reversion in volatility back to the long-run level, while still allowing some “memory” of recent volatility to affect the estimates. To arrive at a working model, one first needs to check that the asset class’s volatility is not constant and that it exhibits some serial correlation. Once that is established, one chooses the parameters (which are generally the length of the lags) to produce the model. Or one iterates different parameter combinations and then chooses which set of combinations to use based on statistical criteria that measure how accurately the model describes the real data. We won’t go through any of that now. Instead, we’ll jump to simulating forward returns and volatility, but you can check out those steps in the Appendix below.

For our simulation, we use the monthly returns on the S&P and a GARCH (1,1) model, which means that we’re using a one period lag on the return and volatility data to calculate the model variables. We plot the simulations below, with a line that identifies the former peak.
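As a stripped-down sketch of that idea using only the fGarch package loaded in the appendix code (the post's actual simulation relies on helper functions from the tutorial's garch_fcts.R, so this is an approximation rather than the code behind the chart), assuming the sp_mon data frame built in the code below:

library(fGarch)

# Fit a GARCH(1,1) with a constant mean to the monthly log returns
# (sp_mon is built in the appendix code below); na.omit drops the leading NA.
ret <- na.omit(sp_mon$log_ret)
fit <- garchFit(formula = ~ garch(1, 1), data = ret, trace = FALSE)
cf  <- coef(fit)  # mu, omega, alpha1, beta1

# Simulate one 5-year (60-month) path by iterating the GARCH(1,1) recursion:
# sigma2_t = omega + alpha1 * eps_{t-1}^2 + beta1 * sigma2_{t-1}
set.seed(42)
n <- 60
eps <- numeric(n)
sigma2 <- numeric(n)
sigma2[1] <- cf["omega"] / (1 - cf["alpha1"] - cf["beta1"])  # start at the long-run variance
eps[1] <- rnorm(1, sd = sqrt(sigma2[1]))
for (t in 2:n) {
  sigma2[t] <- cf["omega"] + cf["alpha1"] * eps[t - 1]^2 + cf["beta1"] * sigma2[t - 1]
  eps[t] <- rnorm(1, sd = sqrt(sigma2[t]))
}
sim_ret <- cf["mu"] + eps  # simulated monthly log returns
exp(cumsum(sim_ret))       # implied cumulative growth of the index along this path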

What’s interesting about this plot, whose code is based on the tutorial, is that there is still a reasonably high probability that the S&P never regains its former peak over the next five years. To our mind, this captures entirely possible scenarios, making the model and simulation more realistic. Recall that it took almost six years for the S&P to exceed the peak preceding the global financial crisis.

Given these simulations, one could then calculate average returns and volatilities to use in setting capital market expectations. Note that this simulation may not include another period of high volatility. Other models attempt to account for that, but are beyond our present discussion.

Let’s rehearse what we’ve learned so far. Trying to produce reasonably robust capital market expectations requires a good estimate of future volatility. If you’re using historical results to set these expectations, then you should use a similar historical time period to match the investment horizon. If you don’t, you could be way off in your estimates, due to the volatility of volatility. Bootstrapping and block sampling can help overcome biases in choosing an historical period on which to base estimates. But these approaches may not adequately account for volatility clustering. A potentially better approach, however, would be to simulate future results using a model designed to capture some of volatility’s known effects. Using a GARCH model is one method and doing so produces results that align with previous historical patterns. But, it is only one model.

In our next post, we’ll return to our regularly scheduled programming where we plan to examine discounted cash flow models to set expectations. Until then, the code behind our analyses follows the Appendix, which goes through some of the same steps as the tutorial in choosing a GARCH model.

Appendix

We run through building the GARCH model as illustrated in the tutorial. The tutorial focuses on the Brazil index, Ibovespa; we use the S&P 500.

Step one: Check data fits an ARCH model

The point of this test is to check whether the data is serially correlated and if variance is non-constant. We present the p-values from the test in the table below. Since they are all below 0.05, that tells us that the data exhibits the effects we’re looking for.

Table 2: S&P 500 Arch test
Lag   Statistic   P-value
1     17.4        0.0
2     20.1        0.0
3     25.4        0.0
4     29.3        0.0
5     29.6        0.0
6     30.1        0.0
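If you prefer not to pull in the tutorial’s helper function, the same kind of ARCH LM test is available as FinTS::ArchTest(). A minimal sketch (the lag choices are ours), applied to the monthly log returns built in the code at the end of the post:

library(FinTS)
sapply(1:6, function(l) ArchTest(na.omit(sp_mon$log_ret), lags = l)$p.value)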

Step Two: Iterate model specifications to find good model

We iterate through different combinations of lags on the return and volatility data to find the model that produces the lowest values of two statistics commonly used to measure prediction error: the Akaike Information Criterion (AIC) and the Bayesian Information Criterion (BIC). The tutorial goes into much greater detail than we do. We show the two best models below. We chose a GARCH(1,1) for our simulation for the sake of simplicity. We’re not sure we could explain an ARMA(6,6) + GARCH(1,1) model in a short post!

Table 3: GARCH models with lowest information criterion
Criterion   Model                   Statistic
AIC         ARMA(6,6)+GARCH(1,1)    -3.66
BIC         ARMA(6,6)+GARCH(1,1)    -3.57
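For reference, a stripped-down version of that search, varying only the GARCH lags and ignoring the ARMA terms, might look like the sketch below. This is our simplification, not the tutorial’s code; the ics slot is where fGarch stores the information criteria for a fitted model.

library(fGarch)

orders <- expand.grid(p = 1:2, q = 1:2)
ic_tab <- t(sapply(seq_len(nrow(orders)), function(i) {
  f <- as.formula(sprintf("~ garch(%d, %d)", orders$p[i], orders$q[i]))
  fit <- garchFit(f, data = na.omit(sp_mon$log_ret), trace = FALSE)
  c(p = orders$p[i], q = orders$q[i], fit@fit$ics[c("AIC", "BIC")])
}))
ic_tab  # pick the row with the lowest AIC or BIC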

Step three: Simulate

We graphed the simulations above. We show a sample of the results in the table below.

Table 4: Top and bottom five cumulative returns
Simulation (#)   Return (%)
268              232.5
303              237.8
36               245.7
432              247.3
483              254.6
529              283.9
450              -28.8
772              -23.6
447              -15.3
439              -15.0
361              -14.8

Using the simulations, one can create a cumulative probability distribution of exceeding the former peak by a certain date (another neat chart from the tutorial). We find that even after five years there’s still around a 20% chance the S&P won’t have surpassed its former peak. We’re not sure that’s the right probability, but it’s reasonable given past history.

Of course, if we had used daily instead of monthly returns and volatility, the probabilities would have gone up. This is an artifact of the time series: the annualized volatility of monthly returns is lower than that of daily ones. While it is beyond the scope of this post to explain why that is the case, the simple explanation is that there is less noise in the monthly data. Nonetheless, the point is that with lower volatility, expected moves are smaller, and thus the probability of exceeding some specific level is lower too.
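You can see the scale difference directly from the data frames built in the code below (a quick check from us, not in the original post):

sd(sp$log_ret, na.rm = TRUE)*sqrt(252)     # annualized daily volatility
sd(sp_mon$log_ret, na.rm = TRUE)*sqrt(12)  # annualized monthly volatility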

We have tried to reproduce the results from the tutorial faithfully, albeit with different data. Any errors are ours.

Here’s the code.

## We imported many of the functions found on the github page for the tutorial.
## Please reference that page for the full code. These functions are preceded
## by source('garch_fcts.R')
## NB: Often one will see the dplyr package called directly to get the filter()
## function. This is because one of the other packages masks the dplyr function.

### Load packages
suppressPackageStartupMessages({
  library(tidyquant)
  library(tidyverse)
  library(FinTS)
  library(fGarch)
})

options("getSymbols.warning4.0" = FALSE)

### Load data
sp <- getSymbols("^GSPC", src = "yahoo", from = "1950-01-01", auto.assign = FALSE) %>%
  Ad() %>%
  `colnames<-`("price")

sp <- data.frame(date = index(sp), price = as.numeric(sp$price)) %>%
  mutate(log_ret = log(price/dplyr::lag(price)),
         disc_ret = price/dplyr::lag(price) - 1)

sp_mon <- sp %>%
  mutate(year_mon = as.yearmon(date)) %>%
  group_by(year_mon) %>%
  dplyr::filter(date == max(date)) %>%
  ungroup() %>%
  mutate(log_ret = log(price/dplyr::lag(price)),
         disc_ret = price/dplyr::lag(price) - 1) %>%
  select(-year_mon)

## Plot annualized volatility
sp %>%
  mutate(vol = rollapply(log_ret, width = 22, sd, align = "right", fill = NA)*sqrt(252)) %>%
  ggplot(aes(date, vol*100)) +
  geom_line(color = "blue") +
  labs(x = "",
       y = "Volatility (%)",
       title = "Annualized rolling 22-day volatility of the S&P 500")

## Plot log chart of S&P
sp %>%
  ggplot(aes(date, price)) +
  geom_line(color = "blue") +
  scale_y_log10() +
  labs(x = "",
       y = "Index",
       title = "S&P 500 long-run chart with y-axis on log-scale")

## Table of volatility spike frequency
sp %>%
  mutate(vol = rollapply(log_ret, width = 22, sd, align = "right", fill = NA)*sqrt(252)) %>%
  mutate(Regime = ifelse(date < "2000-01-01", "Pre-2000", "Post-2000")) %>%
  group_by(Regime) %>%
  summarise("Frequency (%)" = round(sum(vol > 0.25, na.rm = TRUE)/n()*100, 1)) %>%
  knitr::kable(caption = "Frequency of volatility spikes above 25%")

## Find largest 20 moves
largest <- sp %>%
  dplyr::filter(date >= "1980-01-01") %>%
  top_n(abs(log_ret), n = 20)

## Plot volatility clustering
sp %>%
  dplyr::filter(date >= "1980-01-01") %>%
  ggplot(aes(date, log_ret)) +
  geom_line(color = "blue") +
  labs(title = "S&P 500 daily log returns",
       subtitle = "Red dots are the 20 largest absolute daily price changes",
       x = '',
       y = 'Returns',
       caption = 'Based on code written by MS Perlin') +
  geom_point(data = largest,
             aes(date, log_ret),
             size = 3, color = 'red') +
  geom_text(aes(x = as.Date("2015-01-01"),
                y = -0.22,
                label = "Large moves cluster"),
            color = "red") +
  geom_segment(aes(x = as.Date("2015-01-01"), xend = as.Date("2009-01-01"),
                   y = -0.2, yend = -0.12),
               arrow = arrow(length = unit(2, "mm")),
               color = "red") +
  geom_segment(aes(x = as.Date("2015-06-01"), xend = as.Date("2020-02-01"),
                   y = -0.2, yend = -0.12),
               arrow = arrow(length = unit(2, "mm")),
               color = "red") +
  scale_y_continuous(labels = scales::percent) +
  theme(plot.caption = element_text(hjust = 0))

## Run simulation
set.seed(20200320)
n_sim <- 1000
n_periods <- 72

# source functions
source('garch_fcts.R') # for sim_func. We altered this to include different simulation
# periods; e.g., days, months, quarters, etc.
# NB: `my_garch` is the fitted GARCH model produced in the model-selection step
# of the tutorial's code (not shown here).
df_sim <- sim_func(n_sim = n_sim,
                   n_t = n_periods, # n_periods defined above (72 months)
                   my_garch, "months", df_prices = sp_mon)

sp_temp <- sp_mon %>%
  dplyr::filter(date >= "2010-01-03") # dplyr::filter masked by some other package!

ggplot() +
  geom_line(data = sp_temp,
            aes(x = date, y = price), color = 'blue', size = 0.75) +
  geom_line(data = df_sim, aes(x = ref_date, y = sim_price, group = i_sim),
            color = 'lightgrey',
            size = 0.35) +
  geom_hline(yintercept = max(sp_temp$price), color = "red", size = 0.35) +
  labs(title = 'GARCH model projections for the S&P 500',
       subtitle = 'Based on 1000 simulations',
       x = '',
       y = 'Index',
       caption = "Based on code written by MS Perlin") +
  ylim(c(min(sp_temp$price), max(df_sim$sim_price))) +
  scale_y_continuous(labels = scales::comma) +
  theme(plot.caption = element_text(hjust = 0))

# ARCH test function from MS Perlin
source('garch_fcts.r')

# Run test up to six-month lag
max_lag <- 6
arch_test <- do_arch_test(x = sp_mon$log_ret, max_lag = max_lag)

arch_test %>%
  mutate(statistic = round(statistic, 1),
         pvalue = format(round(pvalue, 2), nsmall = 1)) %>%
  rename("Lag" = lag,
         "Statistic" = statistic,
         "P-value" = pvalue) %>%
  knitr::kable(caption = "S&P 500 Arch test",
               align = c("l", "r", "r"))

# NB: `out` is the result of the model-selection step in the tutorial's code
# (not shown here); tab_out holds the information criteria for each model.
tab_out <- out$tab_out

sp_long <- pivot_longer(data = tab_out %>%
                          select(model_name, AIC, BIC),
                        cols = c('AIC', 'BIC'))

sp_best_models <- sp_long %>%
  group_by(name) %>%
  summarise(model_name = model_name[which.min(value)],
            value = value[which.min(value)])

models_names <- unique(sp_long$model_name)
best_model <- c(tab_out$model_name[which.min(tab_out$AIC)],
                tab_out$model_name[which.min(tab_out$BIC)])

# Table of best models by criterion. Note: we pipe sp_best_models here;
# best_model above is a character vector and has no columns to rename.
sp_best_models %>%
  mutate(value = round(value, 2)) %>%
  rename("Criterion" = name,
         "Model" = model_name,
         "Statistic" = value) %>%
  knitr::kable(caption = "GARCH models with lowest information criterion")

df_sim %>%
  group_by(i_sim) %>%
  summarise(tot_ret = round((last(sim_price)/last(sp_mon$price) - 1)*100, 1)) %>%
  arrange(tot_ret) %>%
  slice(c(1:5, (n() - 5):n())) %>%
  rename("Simulation (#)" = i_sim,
         "Return (%)" = tot_ret) %>%
  knitr::kable(caption = "Top and bottom five cumulative returns")

tab_prob <- df_sim %>%
  group_by(ref_date) %>%
  summarise(prob = mean(sim_price > max(sp$price)))

df_date <- tibble(idx = c(first(which(tab_prob$prob > 0.25)),
                          first(which(tab_prob$prob > 0.5)),
                          first(which(tab_prob$prob > 0.75))),
                  ref_date = tab_prob$ref_date[idx],
                  prob = tab_prob$prob[idx],
                  my_text = format(ref_date, '%b %Y'))

tab_prob %>%
  ggplot(aes(x = ref_date, y = prob)) +
  geom_line(size = 1.25, color = "blue") +
  labs(title = paste0('Probabilities of S&P 500 reaching former peak of ',
                      round(max(sp$price), 0)),
       subtitle = paste0('Calculations based on GARCH model simulations'),
       x = '',
       y = 'Probability') +
  scale_y_continuous(labels = scales::percent) +
  geom_point(data = df_date,
             aes(x = ref_date, y = prob), size = 3.5, color = 'red') +
  geom_text(data = df_date, aes(x = ref_date, y = prob,
                                label = my_text),
            nudge_x = -200,
            # nudge_y = -0.05,
            color = 'red', check_overlap = TRUE)

  1. It’s not clear whether this should be done with or without replacement. We can come up with arguments for both. Send us an email at nbw dot osm at gmail if you have a strong view.↩

  2. It could also accommodate path dependency, but we’ll save that discussion for another time.↩


To leave a comment for the author, please follow the link and comment on their blog: R on OSM.



Caching in R


[This article was first published on Posts | Joshua Cook, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

Introduction

Caching intermediate objects in R can be an efficient way to avoid re-evaluating long-running computations. The general process is always the same: run the chunk of code once, store the output to disk, and load it up the next time the same chunk is run. There are, of course, multiple packages in R to help with this process, so I’ve decided to outline some of the more popular options below.

One of the most important features of any caching system is its ability to detect if the cache has become “stale,” that is, when the object on disk is no longer valid because the dependencies of the cached object have changed. This feature is specifically discussed in the sections for each caching method, but, briefly, there are systems for cache invalidation in R Markdown, ‘R.cache’, ‘mustashe,’ and ‘ProjectTemplate.’

Options

Here are the options for caching in R that I will discuss below: caching a code chunk in R Markdown, ‘memoise’, ‘R.cache’, ‘mustashe’, ‘DataCache’, and ‘ProjectTemplate’.

TL;DR

For my final synopsis on when to use each package, skip to the Conclusion.

Caching a code chunk in R Markdown

R Markdown has a built-in caching feature that can be enabled by setting cache=TRUE in the chunk’s header.

```{r import-df, cache=TRUE}
df <- read_tsv("data-file.tsv")
```

The second time the chunk is run, both the visual output and any objects created are loaded from disk. If you are already using R Markdown for your project or work, this is probably the only caching mechanism you will need.

R Markdown does have a method for detecting cache invalidation, though it is not explicitly supported by ‘knitr.’ The basic idea is to set another chunk option that computes some value that, if it changes, should trigger cache invalidation. For instance, say we are reading in a file from disk and want the chunk to re-run if it changes. We can create a new chunk option called cache.extra and assign it some value to indicate if the file has changed, such as the modification date.

```{r import-df, cache=TRUE, cache.extra=file.mtime("data-file.tsv")}
df <- read_tsv("data-file.tsv")
```

Now if the file is modified, the cache for the code chunk will be invalidated and the code will be re-run.
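Chunks can also be invalidated when another cached chunk changes, via the dependson option (a small illustration from us; the chunk labels and column name here are just examples):

```{r clean-df, cache=TRUE, dependson="import-df"}
df_clean <- dplyr::filter(df, !is.na(value))
```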

‘memoise’

The ‘memoise’ package brings in the function memoise(). When a function is “memoised,” the inputs and outputs are remembered so that if a function is passed the same inputs multiple times, the previously computed output can be returned immediately, without re-evaluating the function call. This is an optimization technique from dynamic programming.

The memoise() function is passed a function and returns a new function with the same properties as the original, except it is now memoised (it returns TRUE when passed to is.memoised()). Below is an example where sq(), a simple function that squares its input, is memoised as memo_sq(). A print statement is included in the sq() function to indicate when it has actually been run.

library(memoise)
sq <- function(x) {
  print("Computing square of 'x'")
  x**2
}
memo_sq <- memoise(sq)

The first time memo_sq(2) is run, the function is evaluated and we see the print statement’s message.

memo_sq(2)
#> [1] "Computing square of 'x'"
#> [1] 4

However, the second time, the result is returned from the cache and we see no message.

memo_sq(2)
#> [1] 4

Optionally, a local directory, AWS S3 bucket, or Google Cloud Storage location can be passed as the location to save the cached data (i.e. paired inputs and outputs). This can be useful for storing the memoised values across multiple R sessions.
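For example, a disk-backed cache can be set up along these lines (a sketch; the directory name is arbitrary):

fs_cache <- cache_filesystem(".memoise-cache")
memo_sq_disk <- memoise(sq, cache = fs_cache)
memo_sq_disk(2)
#> [1] "Computing square of 'x'"
#> [1] 4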

As far as I am aware, there is no cache invalidation feature in the ‘memoise’ package. In other words, if I were to change sq() to return the cube of the input, memo_sq() would not be automatically updated or alerted in any way.

sq <- function(x) {
  x**3
}
sq(2)
#> [1] 8
memo_sq(2)
#> [1] 4
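That said, a memoised function’s cache can be cleared manually with forget(), which is the closest thing ‘memoise’ offers (our aside, not from the original post):

forget(memo_sq)
memo_sq(2)
#> [1] "Computing square of 'x'"
#> [1] 4
# Note: memo_sq re-evaluates the function that was passed to memoise(),
# i.e. the original squaring version, not the redefined cube.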

In fairness, caching is not the intended purpose of memoisation, but it is a practical use case, so I think it is still worth including in this article.

‘R.cache’

The documentation for ‘R.cache’ is limited, but from what I can figure out, it implements memoisation while also linking to dependencies for cache invalidation. Further, and the main distinguishing feature between this package and ‘memoise’, ‘R.cache’ memoises an expression, not just a function.

The primary function of ‘R.cache’ is evalWithMemoization(). It takes an expression, evaluates it, and stores both the created object (a in the example below) and the expression itself.

suppressPackageStartupMessages(library(R.cache))
evalWithMemoization({
  print("Evaluating expression.")
  a <- 1
})
#> [1] "Evaluating expression."
#> [1] 1
a
#> [1] 1

Now the second time the expression is evaluated, there is no print message because the result is loaded from disk.

library(R.cache)
evalWithMemoization({
  print("Evaluating expression.")
  a <- 1
})
#> [1] 1

Dependencies can be declared for the memoised expression by passing one or more objects to the key parameter. For example, the object b is listed as a key for the following expression.

b <- 1
evalWithMemoization(
  {
    print("Evaluating expression.")
    a <- 100 + b
  },
  key = b
)
#> [1] "Evaluating expression."
#> [1] 101

If b doesn’t change, then the expression is not re-evaluated.

evalWithMemoization(
  {
    print("Evaluating expression.")
    a <- 100 + b
  },
  key = b
)
#> [1] 101

However, if b changes, then the expression is evaluated again.

b <- 2
evalWithMemoization(
  {
    print("Evaluating expression.")
    a <- 100 + b
  },
  key = b
)
#> [1] "Evaluating expression."
#> [1] 102

While this package has many desirable features for caching, there are some design choices that I do not like. To begin, I am not a huge fan of this package’s API, including the function naming scheme and how the keys are passed after the expression. Further, I do not like that the final result of the expression is automatically returned; I would prefer it be returned invisibly, if at all. Also, I don’t like that the default location for the caching directory is /Users/admin/Library/Caches/R/R.cache; I would prefer it be a hidden directory in the project’s root directory. Finally, the evaluated expression is not invariant to stylistic changes. For instance, if the assignment arrow <- is changed to an =, the expression is re-evaluated.

evalWithMemoization({
  print("Evaluating expression.")
  a = 1
})
#> [1] "Evaluating expression."
#> [1] 1
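(One mitigating note from us, not from the original post: the cache root can at least be redirected, for example to a project-local folder, though the other points stand.)

R.cache::setCacheRootPath("./.Rcache")  # path is arbitrary; our example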

For these reasons, I created the ‘mustashe’ package, demonstrated next.

‘mustashe’

I have recently described ‘mustashe’ in two previous posts (an introduction to ‘mustashe’ and ‘mustashe’ Explained), so I will keep the description here brief.

The stash() function takes a name of the stashed value, an expression to evaluate, and any dependencies.

library(mustashe)
x <- 1
stash("y", depends_on = "x", {
print("Calculating 'y'")
y <- x + 1
})
#> Updating stash.
#> [1] "Calculating 'y'"
# Value of `y`
y
#> [1] 2

Just like ‘R.cache,’ if the value of the dependency x changes, then the code is re-evaluated.

# Change the value of a dependency of `y`.
x <- 2
stash("y", depends_on = "x", {
print("Calculating 'y'")
y <- x + 1
})
#> Updating stash.
#> [1] "Calculating 'y'"

However, ‘mustashe’ handles stylistic changes to the expression better than ‘R.cache’. For instance, if the same code was instead typed by a madman, ‘mustashe’ would still not re-run the code chunk.

stash("y", depends_on = "x", {
print( "Calculating 'y'" )
y = x + 1
# Add a new comment!
})
#> Loading stashed object.

Overall, ‘mustashe’ and ‘R.cache’ are very similar, and the main differences are stylistic.

‘DataCache’

I won’t discuss the ‘DataCache’ package extensively because I personally have little use for it. It has already been explained by the author in a previous R-bloggers post, ‘Data Caching’, so if you are interested, I recommend reading that article. Also, it is not on CRAN nor actively maintained on GitHub. In general, it is intended to periodically load data from an external source. The idea is that the data is dynamic and frequently updated. The ‘DataCache’ package sets a timer for the data and reads in the most recent version at set intervals.

‘ProjectTemplate’

The ‘ProjectTemplate’ package is far more than a caching system; rather, it is a data analysis project framework, and the caching system is merely a part of it. However, the entire framework must be adopted in order to use its caching system (there is a basic explanation of why in ‘mustashe’ Explained – Why not use ’ProjectTemplate’s cache() function?). For this reason, I will not provide an in-depth review of its system, but just provide the following example. (Note, the API is very similar to that used by ‘mustashe’ because it was the inspiration for that package.)

cache("foo", depends = c("a", "b"), {
x <- loaded_data$name
x <- as.character(x)
c(x[[1]], a, b)
})

Conclusion

Here are my recommendations for what caching system to use, in order of precedence:

  1. If you just want memoisation for its intended purpose (i.e. avoiding repetitive calculations), use the ‘memoise’ package.
  2. If using the ‘ProjectTemplate’ framework, then use its built in caching system.
  3. If you are using an R Markdown file, then use the chunk caching feature.
  4. For all other caching needs, choose between ‘mustashe’ and ‘R.cache’ (I prefer using ‘mustashe’, but I am biased).

To leave a comment for the author, please follow the link and comment on their blog: Posts | Joshua Cook.


D is for dummy_cols


[This article was first published on Deeply Trivial, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

For the letter D, I’m going to talk about the dummy_cols function, which isn’t actually part of the tidyverse, but hey: my posts, my rules. This function is incredibly useful for creating dummy variables, which are used in a variety of ways, including multiple regression with categorical variables. When conducting linear regression, the assumption is that both the predictor and outcome variables are numeric. To include categorical variables, you need to convert them to numeric form; since their categories aren’t strictly continuous, you instead create dummy variables to represent them. If I had three levels on a categorical variable, I’d need 2 dummy variables: one to delineate category 1 from the other 2, and another to delineate category 2 (with the third category being represented by 0s on the other two variables).

There are, of course, other uses for dummy variables. For instance, at work, I was examining unique users of our testing system by time of day. Our system creates a row for every action by the user, with a time stamp. If I simply generated counts of these rows during spans of time, I would get a count of actions per hour by users (clicks, highlights, etc.), rather than individual users logged in during a given hour. So I created dummy codes by hour of day, then aggregated by unique user identifier. This was how I could generate accurate counts of how many users were online during a given hour.
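A sketch of that procedure looks something like the following (the data frame and column names are invented stand-ins for the work data, not anything from the actual system):

library(fastDummies)
library(dplyr)
library(lubridate)

hour_flags <- activity_log %>%                       # hypothetical event-level log
  mutate(hour = hour(timestamp)) %>%                 # hour of day from the time stamp
  dummy_cols(select_columns = "hour",
             remove_first_dummy = FALSE)

users_per_hour <- hour_flags %>%
  group_by(user_id) %>%
  summarise_at(vars(starts_with("hour_")), max) %>%  # 1 if the user acted in that hour
  summarise_at(vars(starts_with("hour_")), sum)      # unique users active per hour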

To apply this procedure to the reading dataset, I used the dummy_cols function to create dummy variables (or flags) for genre. I created a long-form dataset of the top genres for each title, which you can download here. For simplicity, this file only contains Book.ID, title, and genre (with a separate entry for each genre, so some books have a single row, for one genre, and others have multiple rows, to reflect multiple genres).

library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 -- 
## ggplot2 3.2.1     purrr   0.3.3
## tibble  2.1.3     dplyr   0.8.3
## tidyr   1.0.0     stringr 1.4.0
## readr   1.3.1     forcats 0.4.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
longreads2019<-read_csv("~/Downloads/Blogging A to Z/reads2019_long.csv")
## Parsed with column specification:
## cols(
##   Book.ID = col_double(),
##   Title = col_character(),
##   genre = col_character()
## )

I can use the dummy_cols function to create the genre flags, which I can then aggregate and merge into the reads2019 file (I’ve created a version without genre flags, available here). For this function, you’ll need the fastDummies package (so add install.packages("fastDummies") before the rest of the code). Also, because dummy coding typically uses one fewer variable than the number of categories, the function offers a remove_first_dummy option to drop the first dummy variable from the final file. Since I’m using these as flags rather than dummy variables, I want to keep every category, which I do with remove_first_dummy = FALSE.

library(fastDummies)

genres <- longreads2019 %>%
  dummy_cols(select_columns = "genre",
             remove_first_dummy = FALSE)

genres <- genres %>%
  group_by(Book.ID) %>%
  summarise(Fiction = max(genre_Fiction),
            Childrens = max(genre_Childrens),
            Fantasy = max(genre_Fantasy),
            SciFi = max(genre_SciFi),
            Mystery = max(genre_Mystery),
            SelfHelp = max(genre_SelfHelp))

reads2019 <- read_csv("~/Downloads/Blogging A to Z/ReadsNoGenre.csv",
                      col_names = TRUE)
## Parsed with column specification:
## cols(
##   Title = col_character(),
##   Pages = col_double(),
##   date_started = col_character(),
##   date_read = col_character(),
##   Book.ID = col_double(),
##   Author = col_character(),
##   AdditionalAuthors = col_character(),
##   AverageRating = col_double(),
##   OriginalPublicationYear = col_double(),
##   read_time = col_double(),
##   MyRating = col_double(),
##   Gender = col_double(),
##   NewRating = col_double(),
##   FinalRating = col_double()
## )
reads2019 <- reads2019 %>%
  left_join(genres, by = "Book.ID")

I know I’ve sprinkled in other tidyverse functions in these posts, such as group_by and summarise. Don’t worry! I’ll post more about those functions in this series – stay tuned!


To leave a comment for the author, please follow the link and comment on their blog: Deeply Trivial.


On the “correlation” between a continuous and a categorical variable


[This article was first published on R-english – Freakonometrics, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

Let us get back to the Titanic dataset,

loc_fichier = "http://freakonometrics.free.fr/titanic.RData"
download.file(loc_fichier, "titanic.RData")
load("titanic.RData")
base = base[!is.na(base$Age),]

We consider two variables, the age x (the continuous one) and the survivor indicator y (the qualitative one),

X = base$Age
Y = base$Survived

It looks like the age might be a valid explanatory variable in the logistic regression,

summary(glm(Survived~Age,data=base,family=binomial))

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.05672    0.17358  -0.327   0.7438
Age         -0.01096    0.00533  -2.057   0.0397 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 964.52  on 713  degrees of freedom
Residual deviance: 960.23  on 712  degrees of freedom
AIC: 964.23

The significance test here has a p-value just below 4%. Actually, one can relate it to the value of the deviance (the null deviance and the residual deviance). Recall that \(D=2\big(\log\mathcal{L}(\boldsymbol{y})-\log\mathcal{L}(\widehat{\boldsymbol{\mu}})\big)\) while \(D_0=2\big(\log\mathcal{L}(\boldsymbol{y})-\log\mathcal{L}(\overline{y})\big)\). Under the assumption that x is worthless, \(D_0-D\) tends to a \(\chi^2\) distribution with 1 degree of freedom. And we can compute the p-value of that likelihood ratio test,

1-pchisq(964.52-960.23,1)
[1] 0.03833717

(which is consistent with a Gaussian test). But if we consider a nonlinear transformation, here using splines (the bs() function from the splines package),

summary(glm(Survived~bs(Age),data=base,family=binomial))

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept)   0.8648     0.3460   2.500 0.012433 *
bs(Age)1     -3.6772     1.0458  -3.516 0.000438 ***
bs(Age)2      1.7430     1.1068   1.575 0.115299
bs(Age)3     -3.9251     1.4544  -2.699 0.006961 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 964.52  on 713  degrees of freedom
Residual deviance: 948.69  on 710  degrees of freedom

which seems to be “more significant”

1-pchisq(964.52-948.69,3)
[1] 0.001228712
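Equivalently (our aside, not in the original post), both likelihood ratio tests against the null model can be obtained directly with anova(),

anova(glm(Survived~Age,data=base,family=binomial), test="Chisq")
anova(glm(Survived~bs(Age),data=base,family=binomial), test="Chisq")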

So it looks like the variable x is interesting here.

To visualize the non-null correlation, one can consider the conditional distribution of x given y=1, and compare it with the conditional distribution of x given y=0,

ks.test(X[Y==0],X[Y==1])

	Two-sample Kolmogorov-Smirnov test

data:  X[Y == 0] and X[Y == 1]
D = 0.088777, p-value = 0.1324
alternative hypothesis: two-sided

i.e. with a p-value above 10%, the two distributions are not significantly different.

F0 = function(x) mean(X[Y==0] <= x)
F1 = function(x) mean(X[Y==1] <= x)
vx = seq(0,80,by=.1)
vy0 = Vectorize(F0)(vx)
vy1 = Vectorize(F1)(vx)
plot(vx,vy0,col="red",type="s")
lines(vx,vy1,col="blue",type="s")

(we can also look at the densities, but it looks like there is not much to see)

An alternative is to discretize the variable x and to use Pearson’s independence test,

k = 5
LV = quantile(X,(0:k)/k)
LV[1] = 0
Xc = cut(X,LV)
table(Xc,Y)
            Y
Xc            0  1
  (0,19]     85 79
  (19,25]    92 45
  (25,31.8]  77 50
  (31.8,41]  81 63
  (41,80]    89 53
chisq.test(table(Xc,Y))

	Pearson's Chi-squared test

data:  table(Xc, Y)
X-squared = 8.6155, df = 4, p-value = 0.07146

The p-value here is 7%, with five categories for the age. And actually, we can compute the p-value as a function of the number of categories,

pvalue = function(k=5){
  LV = quantile(X,(0:k)/k)
  LV[1] = 0
  Xc = cut(X,LV)
  chisq.test(table(Xc,Y))$p.value}
vk = 2:20
vp = Vectorize(pvalue)(vk)
plot(vk,vp,type="l")
abline(h=.05,col="red",lty=2)

which gives a p-value close to 5% as soon as we have enough categories. In the slides of the course (STT5100), I claim that the age is actually an important variable when trying to predict whether a passenger survived. The tests mentioned here are not as conclusive, nevertheless…


To leave a comment for the author, please follow the link and comment on their blog: R-english – Freakonometrics.


Mapping Covid-19 cases: a Shiny app


[This article was first published on R – CoolStatsBlog, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

R lets you create charts and graphs in image form. But the Shiny package lets you create those same charts and graphs in interactive format. I created my first Shiny chart: a world map of confirmed Covid-19 cases. Check it out here.

Unfortunately I cannot embed the app into this website right now, so the below is merely a screenshot. Click the link to play with the app itself.
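For readers who haven’t tried Shiny, the skeleton of such an app is roughly the following (a generic sketch, not the code behind the map above):

library(shiny)

ui <- fluidPage(
  titlePanel("Confirmed Covid-19 cases"),
  selectInput("region", "Region:", choices = c("World", "Asia", "Europe")),  # placeholder choices
  plotOutput("map")
)

server <- function(input, output) {
  output$map <- renderPlot({
    # map-drawing code would go here, filtered by input$region
    plot(1:10, main = input$region)
  })
}

shinyApp(ui, server)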

Screenshot 2020-04-05 at 12.31.34

Abbas Keshvani


To leave a comment for the author, please follow the link and comment on their blog: R – CoolStatsBlog.


SR2 Chapter 3 Medium


[This article was first published on Brian Callander, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

SR2 Chapter 3 Medium

Here’s my solution to the medium exercises in chapter 3 of McElreath’s Statistical Rethinking, 2nd edition.

\(\DeclareMathOperator{\dbinomial}{Binomial} \DeclareMathOperator{\dbernoulli}{Bernoulli} \DeclareMathOperator{\dpoisson}{Poisson} \DeclareMathOperator{\dnormal}{Normal} \DeclareMathOperator{\dt}{t} \DeclareMathOperator{\dcauchy}{Cauchy} \DeclareMathOperator{\dexponential}{Exp} \DeclareMathOperator{\duniform}{Uniform} \DeclareMathOperator{\dgamma}{Gamma} \DeclareMathOperator{\dinvgamma}{InvGamma} \DeclareMathOperator{\invlogit}{InvLogit} \DeclareMathOperator{\logit}{Logit} \DeclareMathOperator{\ddirichlet}{Dirichlet} \DeclareMathOperator{\dbeta}{Beta}\)

Assuming Earth has 70% water cover, and we observe water 8 times out of 15 globe tosses, let’s calculate some posterior quantities with two choices of prior: uniform and step.

p_true <- 0.7
W <- 8
N <- 15
granularity <- 1000 # points on the grid

Uniform Prior

We calculate the grid approximation of the posterior as shown in the book.

m1_grid <- tibble(p = seq(0, 1, length.out = granularity)) %>%
  mutate(prior = 1)

m1_posterior <- m1_grid %>%
  mutate(
    likelihood = dbinom(W, N, p),
    posterior = prior * likelihood
  )
Solution to exercise 3M1

We can get draws from our posterior by sampling the water cover values many times with replacement, each value being drawn in proportion to the posterior probability. We can then just summarise these draws to get the desired interval.

m2_samples <- m1_posterior %>%
  sample_n(10000, replace = T, weight = posterior)

m2_hpdi <- HPDI(m2_samples$p, prob = 0.9)
m2_hpdi
     |0.9      0.9| 
0.3223223 0.7097097 

The histogram looks as follows. This is much the same as the previous graph, but calculated from the samples.

Solution to exercise 3M2

To get the posterior predictive sample, we take our posterior draws of \(p\), then use them to draw a random number of observed water tosses out of 15. The fraction of posterior predictive samples with a given value is then the posterior predictive probability of that value.

m3_prob <- m2_samples %>%
  mutate(W = rbinom(n(), 15, p)) %>%
  group_by(W) %>%
  tally() %>%
  mutate(probability = n / sum(n))
Solution to exercise 3M3

We can also calculate the posterior predictive probabilities with a different number of tosses. Here with 9 tosses.

m4_prob <- m2_samples %>%
  mutate(W = rbinom(n(), 9, p)) %>%
  group_by(W) %>%
  tally() %>%
  mutate(probability = n / sum(n))
Solution to exercise 3M4

Step Prior

Now we repeat the same steps but with the step prior instead of the uniform prior. We’ll just repeat it without comment.

m5_grid <- m1_grid %>%
  mutate(prior = if_else(p < 0.5, 0, 1))

m5_posterior <- m5_grid %>%
  mutate(
    likelihood = dbinom(W, N, p),
    posterior = prior * likelihood
  )
Solution to exercise 3M5 part 1
m5_samples <- m5_posterior %>%
  sample_n(10000, replace = T, weight = posterior)

m5_hpdi <- HPDI(m5_samples$p, prob = 0.9)
m5_hpdi
     |0.9      0.9| 
0.5005005 0.7107107 
Solution to exercise 3M5 part 2
m5_prob <- m5_samples %>%
  mutate(W = rbinom(n(), 15, p)) %>%
  group_by(W) %>%
  tally() %>%
  mutate(probability = n / sum(n))
Solution to exercise 3M5 part 3
m5_prob <- m5_samples %>%
  mutate(W = rbinom(n(), 9, p)) %>%
  group_by(W) %>%
  tally() %>%
  mutate(probability = n / sum(n))
Solution to exercise 3M5 part 4

Let’s compare the proportion of samples within 0.05 of the true value for each prior.

p_close_uniform <- m2_samples %>%
  group_by(close = p %>% between(p_true - 0.05, p_true + 0.05)) %>%
  tally() %>%
  mutate(probability = n / sum(n)) %>%
  filter(close) %>%
  pull(probability)

p_close_step <- m5_samples %>%
  group_by(close = p %>% between(p_true - 0.05, p_true + 0.05)) %>%
  tally() %>%
  mutate(probability = n / sum(n)) %>%
  filter(close) %>%
  pull(probability)

The probability of being close to the true value under the uniform and step priors is 0.1316 and 0.2157, respectively. The step prior thus has more mass around the true value.

Exercise 3M6

Bayesian models are generative, meaning we can simulate new datasets according to our prior probabilities. We’ll simulate 10 datasets for each value of N of interest. We simulate a dataset by randomly choosing a p_true from our uniform prior, then randomly choosing a W from the corresponding binomial distribution.

m6_prior_predictive <- crossing(
    N = 200 * (1:16),
    iter = 1:10
  ) %>%
  mutate(
    p_true = runif(n(), min = 0, max = 1),
    W = rbinom(n(), N, p_true)
  )

For each of these simulated datasets, we grid approximate the posterior, take posterior samples, then calculate the HPDI.

m6_grid <- tibble(p = seq(0, 1, length.out = granularity)) %>%
  mutate(prior = 1)

m6_posteriors <- m6_prior_predictive %>%
  crossing(m6_grid) %>%
  group_by(N, p_true, iter) %>%
  mutate(
    likelihood = dbinom(W, N, p),
    posterior = prior * likelihood
  )

m6_samples <- m6_posteriors %>%
  sample_n(1000, replace = TRUE, weight = posterior)

m6_hpdi <- m6_samples %>%
  summarise(lo = HPDI(p, 0.99)[1], hi = HPDI(p, 0.99)[2]) %>%
  mutate(width = abs(hi - lo))

Now for each value of N, we check how many of the intervals have the desired width.

m6_n <- m6_hpdi %>%
  group_by(N) %>%
  summarise(fraction = mean(width < 0.05))
Solution to exercise 3M6

Thus we expect a sample size around 2600-3000 to give us a sufficiently precise posterior estimation.




To leave a comment for the author, please follow the link and comment on their blog: Brian Callander.

