Kapitel 15 Daten

15.1 2024: Guide Michelin

library(fhswf)       # Laden des fhswf-Packages
View(michelin)       # Anzeigen der Daten
michelin             # Anzeigen der Daten auf der Konsole
?michelin            # Anzeigen der Datenbeschreibung

15.2 2024: Flixbus

library(fhswf)       # Laden des fhswf-Packages
View(flixbus)        # Anzeigen der Daten
flixbus              # Anzeigen der Daten auf der Konsole
?flixbus             # Anzeigen der Datenbeschreibung

15.3 2024: Lego

library(fhswf)       # Laden des fhswf-Packages
View(lego)           # Anzeigen der Daten
lego                 # Anzeigen der Daten auf der Konsole
?lego                # Anzeigen der Datenbeschreibung

15.4 2023: Bike Sharing

library(fhswf)       # Laden des fhswf-Packages
View(bikesharing)    # Anzeigen der Daten
bikesharing          # Anzeigen der Daten auf der Konsole
?bikesharing         # Anzeigen der Datenbeschreibung

Beispiel

# Deskriptive Statistik der benutzten Fahrräder pro Tag
summary(bikesharing$cnt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      22    3152    4548    4504    5956    8714

15.5 2023: Yacht Prices

library(fhswf)       # Laden des fhswf-Packages
View(boatprices)     # Anzeigen der Daten 
boatprices           # Anzeigen der Daten auf der Konsole
?boatprices          # Anzeigen der Datenbeschreibung

Beispiel

# Deskriptive Statistik des Preises der gelisteten Boote in Euro
summary(boatprices$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3500   46670   85000  109823  160000  299750

15.6 2023: Insurance

library(fhswf)       # Laden des fhswf-Packages
View(insurance)      # Anzeigen der Daten
insurance            # Anzeigen der Daten auf der Konsole
?insurance           # Anzeigen der Datenbeschreibung

Beispiel

# Deskriptive Statistik der jährlichen Versicherungskosten in US Dollar
summary(insurance$charges)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1122    4740    9382   13270   16640   63770

15.7 2022: NYC Airbnb

library(fhswf)       # Laden des fhswf-Packages
View(nycairbnb)      # Anzeigen der Daten
nycairbnb            # Anzeigen der Daten auf der Konsole
?nycairbnb           # Anzeigen der Datenbeschreibung

Beispiel

summary(nycairbnb$price) # Deskriptive Statistik des Preises pro Übernachtung

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    69.0   106.0   152.7   175.0 10000.0

15.8 2022: Autopreise

library(fhswf)       # Laden des fhswf-Packages
View(carprices)      # Anzeigen der Daten
carprices            # Anzeigen der Daten auf der Konsole
?carprices           # Anzeigen der Datenbeschreibung

Beispiel

summary(carprices$price) # Deskriptive Statistik des Kaufpreises

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5118    7775   10295   13207   16500   45400       4

15.9 2022: WHO Data

library(fhswf)     # Laden des fhswf-Packages
View(whodata)      # Anzeigen der Daten
whodata            # Anzeigen der Daten auf der Konsole
?whodata           # Anzeigen der Datenbeschreibung

Beispiel

# Human Development Index von Deutschland im Jahr 2015
whodata[whodata$country_code == "DEU" & whodata$year == 2015, ]$hdi

## [1] 0.938

# Mittlerer Human Development Index über alle (verfügbaren) Länder in 2017
median(whodata[whodata$year == 2017, ]$hdi, na.rm = TRUE)

## [1] 0.736

15.10 2021: Taxi Trips

Vorbereitungen

library(geodist)
library(osrm)

# load raw data
coursepath <- "~/sciebo/courses/bchwtz-stat"
filepath <- "nongit/data/2021-taxi/nyctaxi.csv"
savepath <- "data"
d <- read.csv(file.path(coursepath, filepath))

# select a subset of the data
n <- 10000
set.seed(20210408)
d <- d[sample(nrow(d), size = n), ]

# Function to make points dataframe
make_pts <- function(x){
  pts <- data.frame(long = c(x["pickup_longitude"], x["dropoff_longitude"]), 
                    lat = c(x["pickup_latitude"], x["dropoff_latitude"]),
                    row.names = NULL)
  return(pts)
}

# Function to request route information from openstreetmap
make_dist_duration <- function(x, type=c("car","bike","foot")[1]){
  pts <- make_pts(x)
  route <- osrm::osrmRoute(loc=pts, returnclass = "sf", osrm.profile = type)
  return(c(osrm_distance=route$dist, osrm_duration=route$duration))
}

# Funtion to calculate geodesic_distance
make_geodesic_distance <- function(x){
  pickup_loc <- c(longitude=x["pickup_longitude"],
                  latitude=x["pickup_latitude"])
  dropoff_loc <- c(longitude=x["dropoff_longitude"],
                   latitude=x["dropoff_latitude"])
  geodesic_distance <- geodist::geodist(x=pickup_loc,
                                        y=dropoff_loc, 
                                        measure = "geodesic")
  return(geodesic_distance)
}


# Getting route information involves an external server and needs to be batched
# so that it does not time out

mb <- 15 # microbatch size
f <- rep((1:ceiling(nrow(d)/mb)), each=mb)[1:nrow(d)] # splitkey
dlist <- split(d, f)


for (idx in 1:length(dlist)){
  
  batch <- dlist[[idx]]
  # Generate route information and add to dataset
  try({
  res <- apply(batch, MARGIN=1, make_dist_duration)
  df <- cbind(batch, t(res))
  
  # Add day of the week to dataset
  df$weekday <- weekdays(as.Date(df$pickup_datetime))
  
  # Add geodesic distance to dataset
  df$geodesic_distance <- apply(df, MARGIN=1, make_geodesic_distance)
  
  # caching: export to csv
  filename <- paste0("cache_",idx,"-2021_fhswf_stat_taxi.csv")
  write.csv(df,
            file = file.path(coursepath,savepath,filename),
            row.names = FALSE)
  })
  
  Sys.sleep(runif(1,1,mb))
  message(idx)
}

files <- list.files(file.path(coursepath,savepath), 
                    pattern = "^cache*", full.names = T)
length(files)
flist <- lapply(files, read.csv)
df <- do.call("rbind", flist)

# write final csv
write.csv(df,
          file = file.path(coursepath,savepath,"2021_fhswf_stat_taxi.csv"),
          row.names = FALSE)

# delete cache files
unlink(files)

Datenbeschreibung

Variablen

id: a unique identifier for each trip
vendor_id: a code indicating the provider associated with the trip record
pickup_datetime: date and time when the meter was engaged
dropoff_datetime: date and time when the meter was disengaged
passenger_count: the number of passengers in the vehicle (driver entered value)
pickup_longitude: the longitude where the meter was engaged
pickup_latitude: the latitude where the meter was engaged
dropoff_longitude: the longitude where the meter was disengaged
dropoff_latitude: the latitude where the meter was disengaged
store_and_fwd_flag: This flag indicates whether the trip record was held in vehicle memory before sending to the vendor because the vehicle did not have a connection to the server - Y=store and forward; N=not a store and forward trip
trip_duration: duration of the trip in seconds

Ergänzte Variablen

osrm_distance: the travel distance by car in kilometers as calculated by the Open Streetmap Routing Machine
osrm_duration: the estimated duration for travelling the route by car in minutes as estimated by the Open Streetmap Routing Machine
geodesic_distance: the geodesic distance in meters between the pickup and dropoff coordinates calculated using the method of Karney (2013)
weekday: the day of the week when the trip was started

Daten laden

# load the data
x <- read.csv("https://bchwtz.github.io/bchwtz-stat/data/2021_fhswf_stat_taxi.csv")

Tips & Tricks

# working with dates and times
x$pickup_datetime <- as.POSIXlt(x$pickup_datetime, tz="America/New_York")
x$pickup_datetime[1]

## [1] "2016-04-29 13:32:54 EDT"

weekdays(x$pickup_datetime[1])

## [1] "Friday"

months(x$pickup_datetime[1])

## [1] "April"

Quellen

15.11 2021: Hauspreise

Vorbereitungen

library(tidyverse)
# load raw data
coursepath <- "~/sciebo/courses/bchwtz-stat"
filepath <- "nongit/data/2021-housing/germany_housing.csv"
savepath <- "data"
d <- read.csv(file.path(coursepath, filepath))
# rename columns and select relevant ones
d <- d %>%
  rename(
    id = X,
    price = Price,
    type = Type,
    living_space = Living_space,
    lot_size = Lot,
    additional_area = Usable_area,
    availability = Free_of_Relation,
    rooms = Rooms,
    bedrooms = Bedrooms,
    bathrooms = Bathrooms,
    floors = Floors,
    year_construction = Year_built,
    year_modernization = Year_renovated,
    condition = Condition,
    heating = Heating,
    facility_quality = Furnishing_quality,
    energy_source = Energy_source,
    energy_certificate = Energy_certificate,
    energy_certificate_type = Energy_certificate_type,
    energy_consumption = Energy_consumption,
    energy_efficiency_class = Energy_efficiency_class,
    state = State,
    district = City,
    city = Place,
    parking_lot = Garages,
    parking_lot_type = Garagetype
  ) %>%
  select(-availability)

# export to csv
write.csv(d,
          file = file.path(coursepath,savepath,"2021_fhswf_stat_housing.csv"),
          row.names = FALSE)

Datenbeschreibung

id: a unique identifier for each offer
price: the requested price in Euro to buy the building
type: the type of the offered building
living_space: the size of the usable living space in the building in square meters
lot_size: the size of the associated lot in square meters
additional_area: the size of the additionally available area in square meters that is not qualified living space due to legal reasons
rooms: the total number of rooms in the building
bedrooms: the number of bedrooms in the building
bathrooms: the number of bathrooms in the building
floors: number of storys in the building
year_construction: the year in which the building was first constructed
year_modernization: the year in which the building was lastly renovated
facility_quality: an indication of quality of the installed fixtures, facilities and if applicable furniture
condition: the general condition of the building and facilities
heating: the primary technology for heating the building
energy_source: the primary source of energy for heating and warm water
energy_certificate: an indication wheter and energy consumption certificate is required and/or available
energy_certificate_type: the type of the available energy certification
energy_consumption: the required amount of energy in \(kWh/m^2\) per year to heat the building
energy_efficiency_class: an indication of the quality of the thermal insulation of the building according to the german EnEV, ranging from A+ (best) to H (worst)
state: the federal state of germany in which the building is located
district: the respective district of the federal stat in which the building is located
city: the name of the city to whom the building is allocated
parking_lot: the amount of registered parking lots that belong to the building
parking_lot_type: the type of the available parking lots

Daten laden

# load the data
x <- read.csv("https://bchwtz.github.io/bchwtz-stat/data/2021_fhswf_stat_housing.csv")

Quellen

https://www.kaggle.com/scriptsultan/german-house-prices

15.12 2021: Human Ressources

Vorbereitungen

# load raw data
coursepath <- "~/sciebo/courses/bchwtz-stat"
filepath <- "nongit/data/2021-hr/HRDataset_v14.csv"
savepath <- "data"
d <- read.csv(file.path(coursepath, filepath))

# harmonize variable names
names(d) <- gsub("_","",names(d))

# export to csv
write.csv(d,
          file = file.path(coursepath,savepath,"2021_fhswf_stat_hr.csv"),
          row.names = FALSE)

Datenbeschreibung

EmployeeName: Employee’s full name d
EmpID: Employee ID is unique to each employee
MarriedID: Is the person married (1 or 0 for yes or no)
MaritalStatusID: Marital status code that matches the text field MaritalDesc
EmpStatusID: Employment status code that matches text field EmploymentStatus
DeptID Department: ID code that matches the department the employee works in
PerfScoreID: Performance Score code that matches the employee’s most recent performance score
FromDiversityJobFairID: Was the employee sourced from the Diversity job fair? 1 or 0 for yes or no
PayRate: The person’s hourly pay rate. All salaries are converted to hourly pay rate
Termd: Has this employee been terminated - 1 or 0
PositionID: An integer indicating the person’s position
Position: The text name/title of the position the person has
State: The state that the person lives in
Zip: The zip code for the employee
DOB: Date of Birth for the employee
Sex: Sex - M or F
MaritalDesc: The marital status of the person (divorced, single, widowed, separated, etc)
CitizenDesc: Label for whether the person is a Citizen or Eligible NonCitizen
HispanicLatino: Yes or No field for whether the employee is Hispanic/Latino
RaceDesc: Description/text of the race the person identifies with
DateofHire: Date the person was hired
DateofTermination: Date the person was terminated, only populated if, in fact, Termd = 1
TermReason: A text reason / description for why the person was terminated
EmploymentStatus: A description/category of the person’s employment status. Anyone currently working full time = Active
Department: Name of the department that the person works in
ManagerName: The name of the person’s immediate manager
ManagerID: A unique identifier for each manager
RecruitmentSource: The name of the recruitment source where the employee was recruited from
PerformanceScore: Performance Score text/category (Fully Meets, Partially Meets, PIP, Exceeds)
EngagementSurvey: Results from the last engagement survey, managed by our external partner
EmpSatisfaction: A basic satisfaction score between 1 and 5, as reported on a recent employee satisfaction survey
SpecialProjectsCount: The number of special projects that the employee worked on during the last 6 months
LastPerformanceReviewDate: The most recent date of the person’s last performance review
DaysLateLast30: The number of times that the employee was late to work during the last 30 days
Absences: The number of times the employee was absent from work

Daten laden

# load the data
x <- read.csv("https://bchwtz.github.io/bchwtz-stat/data/2021_fhswf_stat_hr.csv")

Tips & Tricks

# working with dates and times
x$Date_Hiring <- as.Date(x$DateofHire,format='%m/%d/%Y')
x$Date_Termination <- as.Date(x$DateofTermination,format='%m/%d/%Y')

# example difference between two timepoints
difftime(x$Date_Termination[2],x$Date_Hiring[2], units = "days")

## Time difference of 444 days