My Tidyverse note
Sang's personal Tidyverse note
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(data.table))
mtcars%>%as_tibble()%>%mutate(rleid=(gear!=lag(gear,default = 0)))%>%mutate(rleid=cumsum(rleid))%>%head()
mtcars%>%as_tibble()%>%
mutate(rleid=(gear!=lag(gear,default = 0)))%>%
mutate(rleid=cumsum(rleid))%>%
group_by(rleid)%>%
mutate(row_number=row_number())%>%
ungroup()%>%head()
find_duplicated_seq<-function(vec){
#find repeated numbers but exclude -1
lead_vec=dplyr::lead(vec,default=0)
out=as.numeric(vec==lead_vec&vec!=-1)
out=as.numeric(out+dplyr::lag(out,default=0)!=0)
return(out)
}
df=tibble(vec=c(1,2,4,5,5,5,6.1,6,6.2,-1,-1,-1,8,8,8))
df%>%mutate(out=find_duplicated_seq(vec))%>%head(15)
# df=tibble(c=vec)
# df=df%>%mutate(rn=row_number())
# if(is.na(df$c[length(df$c)])){
# df$c[length(df$c)]=tail(df$c[!is.na(df$c)],1)
# }
# if(is.na(df$c[1])){
# df$c[1]=df$c[!is.na(df$c)][1]
# }
# tdf=df%>%filter(!is.na(c))
# tdf_diff=tdf$rn%>%diff()
# len_na=tdf_diff[tdf_diff!=1]-1
# miss_idx=tdf$rn[(tdf_diff!=1)]
# start_idx=miss_idx+1
# end_idx=start_idx+len_na-1
# start_idx_ip=start_idx[len_na<dt]
# end_idx_ip=end_idx[len_na<dt]
# start_idx_na=start_idx[len_na>=dt]
# end_idx_na=end_idx[len_na>=dt]
# dfi=df%>%mutate(c=imputeTS::na.interpolation(df$c))
# for(i in 1:length(start_idx_na)){
# dfi$c[start_idx_na[i]:end_idx_na[i]]=df$c[start_idx_na[i]:end_idx_na[i]]
# }
# return(dfi$c)
# }
# cc=c(1,2,NA,NA,5,6.1,6,NA,NA,NA,NA,NA,7,7.1,7.2,7.3,NA,NA,NA)
# impute_na_dt(vec=cc,dt=4)
slicing
Slice dataframe by row number. Negative for disselection. n() for the last row
https://dplyr.tidyverse.org/reference/slice.html
mtcars%>%as_tibble()%>%slice(1:5)%>%slice(c(-1,-n()))
mapping=tibble(channel=c("c01","c02","c03","c04","c05"),
type01=c("ahu1","disposal","ahu2","kitchen1","microwave"),
type02=c("ahu1","disposal","kitchen1","ahu2","light"),
type03=c("ahu1","ahu2","disposal","kitchen1","room"),
type04=c("disposal","ahu1","ahu2","kitchen1","living"))
head(mapping)
print("Transposed dataframe")
# should be updated for pivot_wider / pivot_longer when R 4.0 is available.
mapping %>%
gather(key = var_name, value = value, -1) %>%
spread_(key = names(mapping)[1],value = 'value')%>%head()
df=tibble(x=c(1,0,0,1),y=c(1,2,3,4),z=rep(NA_real_,4),x1=c(2,3,2,3))
print("all df")
df
print("select columns that do not have any NA value and two unique values")
df%>%select_if(list(~(length(unique(.))==2)&(!anyNA(.))))
df%>%select_if(list(~(n_distinct(.)==2)&(!anyNA(.))))
ecobee_changes <- data.table(
change_id = c("c1", "c2", "c3", "c4"),
change_time = ymd_hms(c("2020-11-10 15:05:00", "2020-11-10 16:15:00", "2020-11-10 16:20:00", "2020-11-10 17:30:00"))
)
interactions <- data.table(
interaction_id = c("i1","i2","i3"),
interaction_time = ymd_hms(c("2020-11-10 16:17:30", "2020-11-10 16:19:25", "2020-11-10 17:31:30"))
)
ecobee_changes[, action_time := change_time]
interactions[, action_time := interaction_time]
setkey(ecobee_changes, "action_time")
setkey(interactions, "action_time")
out=as_tibble(ecobee_changes)%>%select(-action_time)%>%
left_join(as_tibble(ecobee_changes[interactions, roll = "nearest"]),on=c("change_time","change_id"))
out
#interactions[ecobee_changes, roll = "nearest"]%>%as_tibble()%>%filter(distinct())
x = data.table(
eventid=c(1,2,3),
start =mdy_hms(c('10/1/2016 04:30:00','10/1/2016 18:02:00','10/2/2016 14:21:00')),
end =mdy_hms(c('10/1/2016 05:43:00','10/2/2016 01:23:00','10/4/2016 08:54:00'))
)
#y is a data table with a list of all dates
y = data.table(
date =c('10/1/2016','10/2/2016','10/3/2016','10/4/2016','10/5/2016','10/6/2016'),
start=mdy_hms(c('10/1/2016 00:00:00','10/2/2016 00:00:00','10/3/2016 00:00:00','10/4/2016 00:00:00','10/5/2016 00:00:00','10/6/2016 00:00:00')),
end =mdy_hms(c('10/1/2016 23:59:59','10/2/2016 23:59:59','10/3/2016 23:59:59','10/4/2016 23:59:59','10/5/2016 23:59:59','10/6/2016 23:59:59'))
)
#set the key on y to match with
setkey(y,"start","end")
#use the foverlaps function to match
#
#note that eventid 1 matches one date only
# eventid 2 matches two dates
# eventid 3 matches three dates
#
result <- foverlaps(x,y,type="any")
#show results
x
y
result
Purrr is useful when to wrap a function for a tibble with vectorization.
map
might be a general language, but I use pmap_x
for general use.
mtcars%>%
as_tibble()%>%
mutate(new=pmap_dbl(list(mpg,cyl),
function(x,y){x*y+y^2+2}))%>%head()
custom_if_else=function(x,y){
if (x==6 & y<=21){
out="type1"
}else if (x==8 & y<18.5){
out="type2"
}else{
out="type3"
}
return (out)
}
mtcars%>%
as_tibble()%>%
mutate(type=pmap_chr(list(mpg,cyl),
function(x,y){custom_if_else(x,y)}))%>%head()
start_time=ymd("2021-01-01",tz="America/Indianapolis")
end_time=ymd_hms("2021-01-03 23:55:00",tz="America/Indianapolis")
print(end_time-start_time)
print(difftime(end_time,start_time))
print(as.numeric(as.duration(end_time-start_time),units='minutes'))
print(interval(start_time,end_time)/minutes(1))
start_time=ymd("2021-01-01",tz="America/Indianapolis")
end_time=ymd_hms("2021-01-03 23:55:00",tz="America/Indianapolis")
minute_diff=interval(start_time,end_time)/minutes(1)
# it creates
time_grid=start_time+(0:(minute_diff/5))*dminutes(5)
print(time_grid[1:2])
print(tail(time_grid,2))
print(sessionInfo())