suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(data.table))

dplyr

Various dplyr sample codes

rleid

mtcars%>%as_tibble()%>%mutate(rleid=(gear!=lag(gear,default = 0)))%>%mutate(rleid=cumsum(rleid))%>%head()
mpg cyl disp hp drat wt qsec vs am gear carb rleid
21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 1
21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 1
22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 1
21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 2
18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 2
18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 2

rleid with index

mtcars%>%as_tibble()%>%
    mutate(rleid=(gear!=lag(gear,default = 0)))%>%
    mutate(rleid=cumsum(rleid))%>%
    group_by(rleid)%>%
    mutate(row_number=row_number())%>%
    ungroup()%>%head()
mpg cyl disp hp drat wt qsec vs am gear carb rleid row_number
21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 1 1
21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 1 2
22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 1 3
21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 2 1
18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 2 2
18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 2 3

find repeated numbers

find_duplicated_seq<-function(vec){
    #find repeated numbers but exclude -1
    lead_vec=dplyr::lead(vec,default=0)
    out=as.numeric(vec==lead_vec&vec!=-1)
    out=as.numeric(out+dplyr::lag(out,default=0)!=0)
    return(out)
}
df=tibble(vec=c(1,2,4,5,5,5,6.1,6,6.2,-1,-1,-1,8,8,8))
df%>%mutate(out=find_duplicated_seq(vec))%>%head(15)
vec out
1.0 0
2.0 0
4.0 0
5.0 1
5.0 1
5.0 1
6.1 0
6.0 0
6.2 0
-1.0 0
-1.0 0
-1.0 0
8.0 1
8.0 1
8.0 1

impute n numbers of consecutive NAs

#     df=tibble(c=vec)
#     df=df%>%mutate(rn=row_number())
#     if(is.na(df$c[length(df$c)])){
#     df$c[length(df$c)]=tail(df$c[!is.na(df$c)],1)
#     }
#     if(is.na(df$c[1])){
#     df$c[1]=df$c[!is.na(df$c)][1]
#     }
#     tdf=df%>%filter(!is.na(c))
#     tdf_diff=tdf$rn%>%diff()
#     len_na=tdf_diff[tdf_diff!=1]-1
#     miss_idx=tdf$rn[(tdf_diff!=1)]

#     start_idx=miss_idx+1
#     end_idx=start_idx+len_na-1

#     start_idx_ip=start_idx[len_na<dt]
#     end_idx_ip=end_idx[len_na<dt]
#     start_idx_na=start_idx[len_na>=dt]
#     end_idx_na=end_idx[len_na>=dt]

#     dfi=df%>%mutate(c=imputeTS::na.interpolation(df$c))

#     for(i in 1:length(start_idx_na)){
#     dfi$c[start_idx_na[i]:end_idx_na[i]]=df$c[start_idx_na[i]:end_idx_na[i]]
#     }
#     return(dfi$c)
# }
# cc=c(1,2,NA,NA,5,6.1,6,NA,NA,NA,NA,NA,7,7.1,7.2,7.3,NA,NA,NA)
# impute_na_dt(vec=cc,dt=4)

slicing

Slice dataframe by row number. Negative for disselection. n() for the last row
https://dplyr.tidyverse.org/reference/slice.html

mtcars%>%as_tibble()%>%slice(1:5)%>%slice(c(-1,-n()))
mpg cyl disp hp drat wt qsec vs am gear carb
21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
21.4 6 258 110 3.08 3.215 19.44 1 0 3 1

transpose of dataframe

mapping=tibble(channel=c("c01","c02","c03","c04","c05"),
              type01=c("ahu1","disposal","ahu2","kitchen1","microwave"),
              type02=c("ahu1","disposal","kitchen1","ahu2","light"),
              type03=c("ahu1","ahu2","disposal","kitchen1","room"),
              type04=c("disposal","ahu1","ahu2","kitchen1","living"))
head(mapping)
print("Transposed dataframe")
# should be updated for pivot_wider / pivot_longer when R 4.0 is available.
mapping %>%
  gather(key = var_name, value = value, -1) %>%
  spread_(key = names(mapping)[1],value = 'value')%>%head()
channel type01 type02 type03 type04
c01 ahu1 ahu1 ahu1 disposal
c02 disposal disposal ahu2 ahu1
c03 ahu2 kitchen1 disposal ahu2
c04 kitchen1 ahu2 kitchen1 kitchen1
c05 microwave light room living
[1] "Transposed dataframe"
var_name c01 c02 c03 c04 c05
type01 ahu1 disposal ahu2 kitchen1 microwave
type02 ahu1 disposal kitchen1 ahu2 light
type03 ahu1 ahu2 disposal kitchen1 room
type04 disposal ahu1 ahu2 kitchen1 living

select columns by conditions

df=tibble(x=c(1,0,0,1),y=c(1,2,3,4),z=rep(NA_real_,4),x1=c(2,3,2,3))
print("all df")
df
print("select columns that do not have any NA value and two unique values")
df%>%select_if(list(~(length(unique(.))==2)&(!anyNA(.))))
df%>%select_if(list(~(n_distinct(.)==2)&(!anyNA(.))))
[1] "all df"
x y z x1
1 1 NA 2
0 2 NA 3
0 3 NA 2
1 4 NA 3
[1] "select columns that do not have any NA value and two unique values"
x x1
1 2
0 3
0 2
1 3
x x1
1 2
0 3
0 2
1 3

rolling joins through data.table

ecobee_changes <- data.table(
  change_id = c("c1", "c2", "c3", "c4"),
  change_time = ymd_hms(c("2020-11-10 15:05:00", "2020-11-10 16:15:00", "2020-11-10 16:20:00", "2020-11-10 17:30:00"))
)

interactions <- data.table(
  interaction_id = c("i1","i2","i3"),
  interaction_time = ymd_hms(c("2020-11-10 16:17:30", "2020-11-10 16:19:25",  "2020-11-10 17:31:30"))
)
ecobee_changes[, action_time := change_time]
interactions[, action_time := interaction_time]

setkey(ecobee_changes, "action_time")
setkey(interactions, "action_time")


out=as_tibble(ecobee_changes)%>%select(-action_time)%>%
  left_join(as_tibble(ecobee_changes[interactions, roll = "nearest"]),on=c("change_time","change_id"))

out
#interactions[ecobee_changes, roll = "nearest"]%>%as_tibble()%>%filter(distinct())
Joining, by = c("change_id", "change_time")
change_id change_time action_time interaction_id interaction_time
c1 2020-11-10 15:05:00 NA NA NA
c2 2020-11-10 16:15:00 2020-11-10 16:17:30 i1 2020-11-10 16:17:30
c3 2020-11-10 16:20:00 2020-11-10 16:19:25 i2 2020-11-10 16:19:25
c4 2020-11-10 17:30:00 2020-11-10 17:31:30 i3 2020-11-10 17:31:30
x = data.table(
  eventid=c(1,2,3),
  start  =mdy_hms(c('10/1/2016 04:30:00','10/1/2016 18:02:00','10/2/2016 14:21:00')),
  end    =mdy_hms(c('10/1/2016 05:43:00','10/2/2016 01:23:00','10/4/2016 08:54:00'))
)

#y is a data table with a list of all dates
y = data.table(
  date =c('10/1/2016','10/2/2016','10/3/2016','10/4/2016','10/5/2016','10/6/2016'),
  start=mdy_hms(c('10/1/2016 00:00:00','10/2/2016 00:00:00','10/3/2016 00:00:00','10/4/2016 00:00:00','10/5/2016 00:00:00','10/6/2016 00:00:00')),
  end  =mdy_hms(c('10/1/2016 23:59:59','10/2/2016 23:59:59','10/3/2016 23:59:59','10/4/2016 23:59:59','10/5/2016 23:59:59','10/6/2016 23:59:59'))
  )

#set the key on y to match with
setkey(y,"start","end")

#use the foverlaps function to match
#
#note that eventid 1 matches one date only
#          eventid 2 matches two dates
#          eventid 3 matches three dates
#
result <- foverlaps(x,y,type="any")

#show results
x
y

result
eventid start end
1 2016-10-01 04:30:00 2016-10-01 05:43:00
2 2016-10-01 18:02:00 2016-10-02 01:23:00
3 2016-10-02 14:21:00 2016-10-04 08:54:00
date start end
10/1/2016 2016-10-01 2016-10-01 23:59:59
10/2/2016 2016-10-02 2016-10-02 23:59:59
10/3/2016 2016-10-03 2016-10-03 23:59:59
10/4/2016 2016-10-04 2016-10-04 23:59:59
10/5/2016 2016-10-05 2016-10-05 23:59:59
10/6/2016 2016-10-06 2016-10-06 23:59:59
date start end eventid i.start i.end
10/1/2016 2016-10-01 2016-10-01 23:59:59 1 2016-10-01 04:30:00 2016-10-01 05:43:00
10/1/2016 2016-10-01 2016-10-01 23:59:59 2 2016-10-01 18:02:00 2016-10-02 01:23:00
10/2/2016 2016-10-02 2016-10-02 23:59:59 2 2016-10-01 18:02:00 2016-10-02 01:23:00
10/2/2016 2016-10-02 2016-10-02 23:59:59 3 2016-10-02 14:21:00 2016-10-04 08:54:00
10/3/2016 2016-10-03 2016-10-03 23:59:59 3 2016-10-02 14:21:00 2016-10-04 08:54:00
10/4/2016 2016-10-04 2016-10-04 23:59:59 3 2016-10-02 14:21:00 2016-10-04 08:54:00

purrr

Purrr is useful when to wrap a function for a tibble with vectorization.

map might be a general language, but I use pmap_x for general use.

mtcars%>%
    as_tibble()%>%
    mutate(new=pmap_dbl(list(mpg,cyl),
                        function(x,y){x*y+y^2+2}))%>%head()
mpg cyl disp hp drat wt qsec vs am gear carb new
21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 164.0
21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 164.0
22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 109.2
21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 166.4
18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 215.6
18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 146.6
custom_if_else=function(x,y){
    if (x==6 & y<=21){
        out="type1"
    }else if (x==8 & y<18.5){
        out="type2"
    }else{
        out="type3"
    }
    return (out)
}
mtcars%>%
    as_tibble()%>%
    mutate(type=pmap_chr(list(mpg,cyl),
                        function(x,y){custom_if_else(x,y)}))%>%head()
mpg cyl disp hp drat wt qsec vs am gear carb type
21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 type3
21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 type3
22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 type3
21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 type3
18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 type3
18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 type3

Lubridate

Find time difference

start_time=ymd("2021-01-01",tz="America/Indianapolis")
end_time=ymd_hms("2021-01-03 23:55:00",tz="America/Indianapolis")
print(end_time-start_time)
print(difftime(end_time,start_time))
print(as.numeric(as.duration(end_time-start_time),units='minutes'))
print(interval(start_time,end_time)/minutes(1))
Time difference of 2.996528 days
Time difference of 2.996528 days
[1] 4315
[1] 4315

Create a time-grid

Create 5 minutes interval timestamp from start_time to end_time.

start_time=ymd("2021-01-01",tz="America/Indianapolis")
end_time=ymd_hms("2021-01-03 23:55:00",tz="America/Indianapolis")
minute_diff=interval(start_time,end_time)/minutes(1)
# it creates
time_grid=start_time+(0:(minute_diff/5))*dminutes(5)
print(time_grid[1:2])
print(tail(time_grid,2))
[1] "2021-01-01 00:00:00 EST" "2021-01-01 00:05:00 EST"
[1] "2021-01-03 23:50:00 EST" "2021-01-03 23:55:00 EST"
print(sessionInfo())
R version 3.6.3 (2020-02-29)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] data.table_1.12.2 lubridate_1.7.4   forcats_0.5.0     stringr_1.4.0    
 [5] dplyr_0.8.0.1     purrr_0.3.4       readr_1.4.0       tidyr_0.8.3      
 [9] tibble_2.1.1      ggplot2_3.1.1     tidyverse_1.2.1  

loaded via a namespace (and not attached):
 [1] pbdZMQ_0.3-3     tidyselect_0.2.5 repr_0.19.2      haven_2.3.1     
 [5] lattice_0.20-38  colorspace_1.4-1 generics_0.0.2   vctrs_0.3.5     
 [9] htmltools_0.3.6  base64enc_0.1-3  rlang_0.4.9      pillar_1.4.7    
[13] glue_1.4.2       withr_2.1.2      modelr_0.1.8     readxl_1.3.1    
[17] uuid_0.1-2       lifecycle_0.2.0  plyr_1.8.4       munsell_0.5.0   
[21] gtable_0.3.0     cellranger_1.1.0 rvest_0.3.3      evaluate_0.13   
[25] broom_0.5.2      IRdisplay_0.7.0  Rcpp_1.0.1       scales_1.0.0    
[29] backports_1.1.4  IRkernel_0.8.15  jsonlite_1.6     hms_0.5.3       
[33] digest_0.6.18    stringi_1.4.3    grid_3.6.3       cli_1.1.0       
[37] tools_3.6.3      magrittr_1.5     lazyeval_0.2.2   crayon_1.3.4    
[41] pkgconfig_2.0.2  ellipsis_0.3.1   xml2_1.2.0       assertthat_0.2.1
[45] httr_1.4.0       rstudioapi_0.13  R6_2.4.0         nlme_3.1-139    
[49] compiler_3.6.3