Kapitel 15 Daten
15.4 2023: Bike Sharing
15.5 2023: Yacht Prices
15.6 2023: Insurance
15.7 2022: NYC Airbnb
15.8 2022: Autopreise
15.9 2022: WHO Data
15.10 2021: Taxi Trips
Vorbereitungen
library(geodist)
library(osrm)
# load raw data
coursepath <- "~/sciebo/courses/bchwtz-stat"
filepath <- "nongit/data/2021-taxi/nyctaxi.csv"
savepath <- "data"
d <- read.csv(file.path(coursepath, filepath))
# select a subset of the data
n <- 10000
set.seed(20210408)
d <- d[sample(nrow(d), size = n), ]
# Function to make points dataframe
make_pts <- function(x){
pts <- data.frame(long = c(x["pickup_longitude"], x["dropoff_longitude"]),
lat = c(x["pickup_latitude"], x["dropoff_latitude"]),
row.names = NULL)
return(pts)
}
# Function to request route information from openstreetmap
make_dist_duration <- function(x, type=c("car","bike","foot")[1]){
pts <- make_pts(x)
route <- osrm::osrmRoute(loc=pts, returnclass = "sf", osrm.profile = type)
return(c(osrm_distance=route$dist, osrm_duration=route$duration))
}
# Funtion to calculate geodesic_distance
make_geodesic_distance <- function(x){
pickup_loc <- c(longitude=x["pickup_longitude"],
latitude=x["pickup_latitude"])
dropoff_loc <- c(longitude=x["dropoff_longitude"],
latitude=x["dropoff_latitude"])
geodesic_distance <- geodist::geodist(x=pickup_loc,
y=dropoff_loc,
measure = "geodesic")
return(geodesic_distance)
}
# Getting route information involves an external server and needs to be batched
# so that it does not time out
mb <- 15 # microbatch size
f <- rep((1:ceiling(nrow(d)/mb)), each=mb)[1:nrow(d)] # splitkey
dlist <- split(d, f)
for (idx in 1:length(dlist)){
batch <- dlist[[idx]]
# Generate route information and add to dataset
try({
res <- apply(batch, MARGIN=1, make_dist_duration)
df <- cbind(batch, t(res))
# Add day of the week to dataset
df$weekday <- weekdays(as.Date(df$pickup_datetime))
# Add geodesic distance to dataset
df$geodesic_distance <- apply(df, MARGIN=1, make_geodesic_distance)
# caching: export to csv
filename <- paste0("cache_",idx,"-2021_fhswf_stat_taxi.csv")
write.csv(df,
file = file.path(coursepath,savepath,filename),
row.names = FALSE)
})
Sys.sleep(runif(1,1,mb))
message(idx)
}
files <- list.files(file.path(coursepath,savepath),
pattern = "^cache*", full.names = T)
length(files)
flist <- lapply(files, read.csv)
df <- do.call("rbind", flist)
# write final csv
write.csv(df,
file = file.path(coursepath,savepath,"2021_fhswf_stat_taxi.csv"),
row.names = FALSE)
# delete cache files
unlink(files)
Datenbeschreibung
Variablen
id
: a unique identifier for each tripvendor_id
: a code indicating the provider associated with the trip recordpickup_datetime
: date and time when the meter was engageddropoff_datetime
: date and time when the meter was disengagedpassenger_count
: the number of passengers in the vehicle (driver entered value)pickup_longitude
: the longitude where the meter was engagedpickup_latitude
: the latitude where the meter was engageddropoff_longitude
: the longitude where the meter was disengageddropoff_latitude
: the latitude where the meter was disengagedstore_and_fwd_flag
: This flag indicates whether the trip record was held in vehicle memory before sending to the vendor because the vehicle did not have a connection to the server - Y=store and forward; N=not a store and forward triptrip_duration
: duration of the trip in seconds
Ergänzte Variablen
osrm_distance
: the travel distance by car in kilometers as calculated by the Open Streetmap Routing Machineosrm_duration
: the estimated duration for travelling the route by car in minutes as estimated by the Open Streetmap Routing Machinegeodesic_distance
: the geodesic distance in meters between the pickup and dropoff coordinates calculated using the method of Karney (2013)weekday:
the day of the week when the trip was started
15.11 2021: Hauspreise
Vorbereitungen
library(tidyverse)
# load raw data
coursepath <- "~/sciebo/courses/bchwtz-stat"
filepath <- "nongit/data/2021-housing/germany_housing.csv"
savepath <- "data"
d <- read.csv(file.path(coursepath, filepath))
# rename columns and select relevant ones
d <- d %>%
rename(
id = X,
price = Price,
type = Type,
living_space = Living_space,
lot_size = Lot,
additional_area = Usable_area,
availability = Free_of_Relation,
rooms = Rooms,
bedrooms = Bedrooms,
bathrooms = Bathrooms,
floors = Floors,
year_construction = Year_built,
year_modernization = Year_renovated,
condition = Condition,
heating = Heating,
facility_quality = Furnishing_quality,
energy_source = Energy_source,
energy_certificate = Energy_certificate,
energy_certificate_type = Energy_certificate_type,
energy_consumption = Energy_consumption,
energy_efficiency_class = Energy_efficiency_class,
state = State,
district = City,
city = Place,
parking_lot = Garages,
parking_lot_type = Garagetype
) %>%
select(-availability)
# export to csv
write.csv(d,
file = file.path(coursepath,savepath,"2021_fhswf_stat_housing.csv"),
row.names = FALSE)
Datenbeschreibung
id
: a unique identifier for each offerprice
: the requested price in Euro to buy the buildingtype
: the type of the offered buildingliving_space
: the size of the usable living space in the building in square meterslot_size
: the size of the associated lot in square metersadditional_area
: the size of the additionally available area in square meters that is not qualified living space due to legal reasonsrooms
: the total number of rooms in the buildingbedrooms
: the number of bedrooms in the buildingbathrooms
: the number of bathrooms in the buildingfloors
: number of storys in the buildingyear_construction
: the year in which the building was first constructedyear_modernization
: the year in which the building was lastly renovatedfacility_quality
: an indication of quality of the installed fixtures, facilities and if applicable furniturecondition
: the general condition of the building and facilitiesheating
: the primary technology for heating the buildingenergy_source
: the primary source of energy for heating and warm waterenergy_certificate
: an indication wheter and energy consumption certificate is required and/or availableenergy_certificate_type
: the type of the available energy certificationenergy_consumption
: the required amount of energy in \(kWh/m^2\) per year to heat the buildingenergy_efficiency_class
: an indication of the quality of the thermal insulation of the building according to the german EnEV, ranging from A+ (best) to H (worst)state
: the federal state of germany in which the building is locateddistrict
: the respective district of the federal stat in which the building is locatedcity
: the name of the city to whom the building is allocatedparking_lot
: the amount of registered parking lots that belong to the buildingparking_lot_type
: the type of the available parking lots
15.12 2021: Human Ressources
Vorbereitungen
# load raw data
coursepath <- "~/sciebo/courses/bchwtz-stat"
filepath <- "nongit/data/2021-hr/HRDataset_v14.csv"
savepath <- "data"
d <- read.csv(file.path(coursepath, filepath))
# harmonize variable names
names(d) <- gsub("_","",names(d))
# export to csv
write.csv(d,
file = file.path(coursepath,savepath,"2021_fhswf_stat_hr.csv"),
row.names = FALSE)
Datenbeschreibung
EmployeeName
: Employee’s full name dEmpID
: Employee ID is unique to each employee
MarriedID
: Is the person married (1 or 0 for yes or no)
MaritalStatusID
: Marital status code that matches the text field MaritalDescEmpStatusID
: Employment status code that matches text field EmploymentStatusDeptID Department
: ID code that matches the department the employee works in
PerfScoreID
: Performance Score code that matches the employee’s most recent performance score
FromDiversityJobFairID
: Was the employee sourced from the Diversity job fair? 1 or 0 for yes or no
PayRate
: The person’s hourly pay rate. All salaries are converted to hourly pay rateTermd
: Has this employee been terminated - 1 or 0
PositionID
: An integer indicating the person’s positionPosition
: The text name/title of the position the person has
State
: The state that the person lives in
Zip
: The zip code for the employee
DOB
: Date of Birth for the employee
Sex
: Sex - M or F
MaritalDesc
: The marital status of the person (divorced, single, widowed, separated, etc)
CitizenDesc
: Label for whether the person is a Citizen or Eligible NonCitizen
HispanicLatino
: Yes or No field for whether the employee is Hispanic/LatinoRaceDesc
: Description/text of the race the person identifies withDateofHire
: Date the person was hired
DateofTermination
: Date the person was terminated, only populated if, in fact, Termd = 1
TermReason
: A text reason / description for why the person was terminated
EmploymentStatus
: A description/category of the person’s employment status. Anyone currently working full time = Active
Department
: Name of the department that the person works inManagerName
: The name of the person’s immediate manager
ManagerID
: A unique identifier for each managerRecruitmentSource
: The name of the recruitment source where the employee was recruited from
PerformanceScore
: Performance Score text/category (Fully Meets, Partially Meets, PIP, Exceeds)
EngagementSurvey
: Results from the last engagement survey, managed by our external partner
EmpSatisfaction
: A basic satisfaction score between 1 and 5, as reported on a recent employee satisfaction surveySpecialProjectsCount
: The number of special projects that the employee worked on during the last 6 monthsLastPerformanceReviewDate
: The most recent date of the person’s last performance review
DaysLateLast30
: The number of times that the employee was late to work during the last 30 days
Absences
: The number of times the employee was absent from work
Tips & Tricks
# working with dates and times
x$Date_Hiring <- as.Date(x$DateofHire,format='%m/%d/%Y')
x$Date_Termination <- as.Date(x$DateofTermination,format='%m/%d/%Y')
# example difference between two timepoints
difftime(x$Date_Termination[2],x$Date_Hiring[2], units = "days")
## Time difference of 444 days