Erste Schritte mit `vroom`

Nach readr kommt vroom. In der Zwischenzeit liegt vroom in der Version 1.2.0 vor und daher habe ich mir ein paar Stunden Zeit genommen, um ein paar erste Experimente damit zu machen.

Erste Schritte mit vroom

library(vroom)

# Source URL of tips.csv:
url <- "https://goo.gl/whKjnl"

# Locale: German language, decimal comma, "." as grouping mark
mylocale <- locale(
  date_names = "de",
  decimal_mark = ",",
  grouping_mark = "."
)

# Explicit column types, used when the types should not be guessed.
# Each spec is bound to its column by name rather than by position, so a
# reordered or extended file cannot silently receive the wrong types.
mycols <- cols(
  total_bill = col_number(),
  tip        = col_number(),
  sex        = col_factor(),
  smoker     = col_factor(),
  day        = col_factor(),
  time       = col_factor(),
  size       = col_integer()
)
# Read with vroom: column types are guessed, number format taken from mylocale
tips.vroom <- vroom(
  file = url,
  locale = mylocale
)
## Rows: 244 Columns: 7
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (4): sex, smoker, day, time
## dbl (3): total_bill, tip, size
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(tips.vroom)
## # A tibble: 6 × 7
##   total_bill   tip sex    smoker day   time    size
##        <dbl> <dbl> <chr>  <chr>  <chr> <chr>  <dbl>
## 1       17.0  1.01 Female No     Sun   Dinner     2
## 2       10.3  1.66 Male   No     Sun   Dinner     3
## 3       21.0  3.5  Male   No     Sun   Dinner     3
## 4       23.7  3.31 Male   No     Sun   Dinner     2
## 5       24.6  3.61 Female No     Sun   Dinner     4
## 6       25.3  4.71 Male   No     Sun   Dinner     4
str(tips.vroom)
## spec_tbl_df [244 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ total_bill: num [1:244] 17 10.3 21 23.7 24.6 ...
##  $ tip       : num [1:244] 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : chr [1:244] "Female" "Male" "Male" "Male" ...
##  $ smoker    : chr [1:244] "No" "No" "No" "No" ...
##  $ day       : chr [1:244] "Sun" "Sun" "Sun" "Sun" ...
##  $ time      : chr [1:244] "Dinner" "Dinner" "Dinner" "Dinner" ...
##  $ size      : num [1:244] 2 3 3 2 4 4 2 4 2 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   total_bill = col_double(),
##   ..   tip = col_double(),
##   ..   sex = col_character(),
##   ..   smoker = col_character(),
##   ..   day = col_character(),
##   ..   time = col_character(),
##   ..   size = col_double(),
##   ..   .delim = ";"
##   .. )
##  - attr(*, "problems")=<externalptr>
object.size(tips.vroom)
## 20632 bytes
# Read with vroom again, this time with the explicit column types from mycols
# and the number format from mylocale
tips.vroom2 <- vroom(
  file = url,
  col_types = mycols,
  locale = mylocale
)

head(tips.vroom2)
## # A tibble: 6 × 7
##   total_bill   tip sex    smoker day   time    size
##        <dbl> <dbl> <fct>  <fct>  <fct> <fct>  <int>
## 1       17.0  1.01 Female No     Sun   Dinner     2
## 2       10.3  1.66 Male   No     Sun   Dinner     3
## 3       21.0  3.5  Male   No     Sun   Dinner     3
## 4       23.7  3.31 Male   No     Sun   Dinner     2
## 5       24.6  3.61 Female No     Sun   Dinner     4
## 6       25.3  4.71 Male   No     Sun   Dinner     4
str(tips.vroom2)
## spec_tbl_df [244 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ total_bill: num [1:244] 17 10.3 21 23.7 24.6 ...
##  $ tip       : num [1:244] 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
##  $ smoker    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ day       : Factor w/ 4 levels "Sun","Sat","Thur",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time      : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
##  $ size      : int [1:244] 2 3 3 2 4 4 2 4 2 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   total_bill = col_number(),
##   ..   tip = col_number(),
##   ..   sex = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   smoker = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   day = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   time = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   size = col_integer(),
##   ..   .delim = ";"
##   .. )
##  - attr(*, "problems")=<externalptr>
object.size(tips.vroom2)
## 19416 bytes
# Readr
library(readr)
## Registered S3 methods overwritten by 'readr':
##   method                    from 
##   as.data.frame.spec_tbl_df vroom
##   as_tibble.spec_tbl_df     vroom
##   format.col_spec           vroom
##   print.col_spec            vroom
##   print.collector           vroom
##   print.date_names          vroom
##   print.locale              vroom
##   str.col_spec              vroom
## 
## Attache Paket: 'readr'
## Die folgenden Objekte sind maskiert von 'package:vroom':
## 
##     as.col_spec, col_character, col_date, col_datetime, col_double,
##     col_factor, col_guess, col_integer, col_logical, col_number,
##     col_skip, col_time, cols, cols_condense, cols_only, date_names,
##     date_names_lang, date_names_langs, default_locale, fwf_cols,
##     fwf_empty, fwf_positions, fwf_widths, locale, output_column,
##     problems, spec
# Read the same source with readr; column types are guessed.
# No locale is passed here: read_csv2() reports its own "," decimal mark
# and "." grouping mark when it runs.
tips.readr <- readr::read_csv2(file = url)
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 244 Columns: 7
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (4): sex, smoker, day, time
## dbl (3): total_bill, tip, size
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(tips.readr)
## # A tibble: 6 × 7
##   total_bill   tip sex    smoker day   time    size
##        <dbl> <dbl> <chr>  <chr>  <chr> <chr>  <dbl>
## 1       17.0  1.01 Female No     Sun   Dinner     2
## 2       10.3  1.66 Male   No     Sun   Dinner     3
## 3       21.0  3.5  Male   No     Sun   Dinner     3
## 4       23.7  3.31 Male   No     Sun   Dinner     2
## 5       24.6  3.61 Female No     Sun   Dinner     4
## 6       25.3  4.71 Male   No     Sun   Dinner     4
str(tips.readr)
## spec_tbl_df [244 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ total_bill: num [1:244] 17 10.3 21 23.7 24.6 ...
##  $ tip       : num [1:244] 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : chr [1:244] "Female" "Male" "Male" "Male" ...
##  $ smoker    : chr [1:244] "No" "No" "No" "No" ...
##  $ day       : chr [1:244] "Sun" "Sun" "Sun" "Sun" ...
##  $ time      : chr [1:244] "Dinner" "Dinner" "Dinner" "Dinner" ...
##  $ size      : num [1:244] 2 3 3 2 4 4 2 4 2 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   total_bill = col_double(),
##   ..   tip = col_double(),
##   ..   sex = col_character(),
##   ..   smoker = col_character(),
##   ..   day = col_character(),
##   ..   time = col_character(),
##   ..   size = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
object.size(tips.readr)
## 20632 bytes
# readr with explicit column types.
# The types are given as a cols() specification with one named entry per
# column, matching the form of the vroom column spec, instead of a bare
# positional list whose column names existed only in comments.
tips.readr2 <- readr::read_csv2(
  url,
  col_types = cols(
    total_bill = col_double(),
    tip        = col_double(),
    sex        = col_factor(),
    smoker     = col_factor(),
    day        = col_factor(),
    time       = col_factor(),
    size       = col_integer()
  )
)
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
head(tips.readr2)
## # A tibble: 6 × 7
##   total_bill   tip sex    smoker day   time    size
##        <dbl> <dbl> <fct>  <fct>  <fct> <fct>  <int>
## 1       17.0  1.01 Female No     Sun   Dinner     2
## 2       10.3  1.66 Male   No     Sun   Dinner     3
## 3       21.0  3.5  Male   No     Sun   Dinner     3
## 4       23.7  3.31 Male   No     Sun   Dinner     2
## 5       24.6  3.61 Female No     Sun   Dinner     4
## 6       25.3  4.71 Male   No     Sun   Dinner     4
str(tips.readr2)
## spec_tbl_df [244 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ total_bill: num [1:244] 17 10.3 21 23.7 24.6 ...
##  $ tip       : num [1:244] 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
##  $ smoker    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ day       : Factor w/ 4 levels "Sun","Sat","Thur",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time      : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
##  $ size      : int [1:244] 2 3 3 2 4 4 2 4 2 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   total_bill = col_double(),
##   ..   tip = col_double(),
##   ..   sex = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   smoker = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   day = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   time = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   size = col_integer()
##   .. )
##  - attr(*, "problems")=<externalptr>
object.size(tips.readr2)
## 19416 bytes
# Base R only: download the file to the working directory, then parse it
# with read.csv2().
# `url` is reused so this path reads exactly the same source as the vroom
# and readr calls, instead of repeating the link as a second literal.
download.file(url, destfile = "tips.csv")
tips.csv2 <- read.csv2("tips.csv")
head(tips.csv2)
##   total_bill  tip    sex smoker day   time size
## 1      16.99 1.01 Female     No Sun Dinner    2
## 2      10.34 1.66   Male     No Sun Dinner    3
## 3      21.01 3.50   Male     No Sun Dinner    3
## 4      23.68 3.31   Male     No Sun Dinner    2
## 5      24.59 3.61 Female     No Sun Dinner    4
## 6      25.29 4.71   Male     No Sun Dinner    4
str(tips.csv2)
## 'data.frame':    244 obs. of  7 variables:
##  $ total_bill: num  17 10.3 21 23.7 24.6 ...
##  $ tip       : num  1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : chr  "Female" "Male" "Male" "Male" ...
##  $ smoker    : chr  "No" "No" "No" "No" ...
##  $ day       : chr  "Sun" "Sun" "Sun" "Sun" ...
##  $ time      : chr  "Dinner" "Dinner" "Dinner" "Dinner" ...
##  $ size      : int  2 3 3 2 4 4 2 4 2 2 ...
object.size(tips.csv2)
## 14720 bytes
Norman Markgraf
Norman Markgraf
Diplom-Mathematiker

Norman Markgraf ist freiberuflicher Dozent für Mathematik, Statistik, Data Science und Informatik, sowie freiberuflicher Programmierer.

Ähnliches