suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(data.table))

dplyr

Various dplyr sample codes

rleid

mtcars%>%as_tibble()%>%mutate(rleid=(gear!=lag(gear,default = 0)))%>%mutate(rleid=cumsum(rleid))%>%head()

rleid with index

mtcars%>%as_tibble()%>%
    mutate(rleid=(gear!=lag(gear,default = 0)))%>%
    mutate(rleid=cumsum(rleid))%>%
    group_by(rleid)%>%
    mutate(row_number=row_number())%>%
    ungroup()%>%head()

find repeated numbers

find_duplicated_seq<-function(vec){
    #find repeated numbers but exclude -1
    lead_vec=dplyr::lead(vec,default=0)
    out=as.numeric(vec==lead_vec&vec!=-1)
    out=as.numeric(out+dplyr::lag(out,default=0)!=0)
    return(out)
}
df=tibble(vec=c(1,2,4,5,5,5,6.1,6,6.2,-1,-1,-1,8,8,8))
df%>%mutate(out=find_duplicated_seq(vec))%>%head(15)

impute n numbers of consecutive NAs

#     df=tibble(c=vec)
#     df=df%>%mutate(rn=row_number())
#     if(is.na(df$c[length(df$c)])){
#     df$c[length(df$c)]=tail(df$c[!is.na(df$c)],1)
#     }
#     if(is.na(df$c[1])){
#     df$c[1]=df$c[!is.na(df$c)][1]
#     }
#     tdf=df%>%filter(!is.na(c))
#     tdf_diff=tdf$rn%>%diff()
#     len_na=tdf_diff[tdf_diff!=1]-1
#     miss_idx=tdf$rn[(tdf_diff!=1)]

#     start_idx=miss_idx+1
#     end_idx=start_idx+len_na-1

#     start_idx_ip=start_idx[len_na<dt]
#     end_idx_ip=end_idx[len_na<dt]
#     start_idx_na=start_idx[len_na>=dt]
#     end_idx_na=end_idx[len_na>=dt]

#     dfi=df%>%mutate(c=imputeTS::na.interpolation(df$c))

#     for(i in 1:length(start_idx_na)){
#     dfi$c[start_idx_na[i]:end_idx_na[i]]=df$c[start_idx_na[i]:end_idx_na[i]]
#     }
#     return(dfi$c)
# }
# cc=c(1,2,NA,NA,5,6.1,6,NA,NA,NA,NA,NA,7,7.1,7.2,7.3,NA,NA,NA)
# impute_na_dt(vec=cc,dt=4)

slicing

Slice dataframe by row number. Negative for disselection. n() for the last row
https://dplyr.tidyverse.org/reference/slice.html

mtcars%>%as_tibble()%>%slice(1:5)%>%slice(c(-1,-n()))

transpose of dataframe

mapping=tibble(channel=c("c01","c02","c03","c04","c05"),
              type01=c("ahu1","disposal","ahu2","kitchen1","microwave"),
              type02=c("ahu1","disposal","kitchen1","ahu2","light"),
              type03=c("ahu1","ahu2","disposal","kitchen1","room"),
              type04=c("disposal","ahu1","ahu2","kitchen1","living"))
head(mapping)
print("Transposed dataframe")
# should be updated for pivot_wider / pivot_longer when R 4.0 is available.
mapping %>%
  gather(key = var_name, value = value, -1) %>%
  spread_(key = names(mapping)[1],value = 'value')%>%head()

[1] "Transposed dataframe"

select columns by conditions

df=tibble(x=c(1,0,0,1),y=c(1,2,3,4),z=rep(NA_real_,4),x1=c(2,3,2,3))
print("all df")
df
print("select columns that do not have any NA value and two unique values")
df%>%select_if(list(~(length(unique(.))==2)&(!anyNA(.))))
df%>%select_if(list(~(n_distinct(.)==2)&(!anyNA(.))))

[1] "all df"

[1] "select columns that do not have any NA value and two unique values"

rolling joins through data.table

ecobee_changes <- data.table(
  change_id = c("c1", "c2", "c3", "c4"),
  change_time = ymd_hms(c("2020-11-10 15:05:00", "2020-11-10 16:15:00", "2020-11-10 16:20:00", "2020-11-10 17:30:00"))
)

interactions <- data.table(
  interaction_id = c("i1","i2","i3"),
  interaction_time = ymd_hms(c("2020-11-10 16:17:30", "2020-11-10 16:19:25",  "2020-11-10 17:31:30"))
)

ecobee_changes[, action_time := change_time]
interactions[, action_time := interaction_time]

setkey(ecobee_changes, "action_time")
setkey(interactions, "action_time")


out=as_tibble(ecobee_changes)%>%select(-action_time)%>%
  left_join(as_tibble(ecobee_changes[interactions, roll = "nearest"]),on=c("change_time","change_id"))

out
#interactions[ecobee_changes, roll = "nearest"]%>%as_tibble()%>%filter(distinct())

Joining, by = c("change_id", "change_time")

x = data.table(
  eventid=c(1,2,3),
  start  =mdy_hms(c('10/1/2016 04:30:00','10/1/2016 18:02:00','10/2/2016 14:21:00')),
  end    =mdy_hms(c('10/1/2016 05:43:00','10/2/2016 01:23:00','10/4/2016 08:54:00'))
)

#y is a data table with a list of all dates
y = data.table(
  date =c('10/1/2016','10/2/2016','10/3/2016','10/4/2016','10/5/2016','10/6/2016'),
  start=mdy_hms(c('10/1/2016 00:00:00','10/2/2016 00:00:00','10/3/2016 00:00:00','10/4/2016 00:00:00','10/5/2016 00:00:00','10/6/2016 00:00:00')),
  end  =mdy_hms(c('10/1/2016 23:59:59','10/2/2016 23:59:59','10/3/2016 23:59:59','10/4/2016 23:59:59','10/5/2016 23:59:59','10/6/2016 23:59:59'))
  )

#set the key on y to match with
setkey(y,"start","end")

#use the foverlaps function to match
#
#note that eventid 1 matches one date only
#          eventid 2 matches two dates
#          eventid 3 matches three dates
#
result <- foverlaps(x,y,type="any")

#show results
x
y

result

purrr

Purrr is useful when to wrap a function for a tibble with vectorization.

map might be a general language, but I use pmap_x for general use.

mtcars%>%
    as_tibble()%>%
    mutate(new=pmap_dbl(list(mpg,cyl),
                        function(x,y){x*y+y^2+2}))%>%head()

custom_if_else=function(x,y){
    if (x==6 & y<=21){
        out="type1"
    }else if (x==8 & y<18.5){
        out="type2"
    }else{
        out="type3"
    }
    return (out)
}

mtcars%>%
    as_tibble()%>%
    mutate(type=pmap_chr(list(mpg,cyl),
                        function(x,y){custom_if_else(x,y)}))%>%head()

Lubridate

Find time difference

start_time=ymd("2021-01-01",tz="America/Indianapolis")
end_time=ymd_hms("2021-01-03 23:55:00",tz="America/Indianapolis")

print(end_time-start_time)
print(difftime(end_time,start_time))
print(as.numeric(as.duration(end_time-start_time),units='minutes'))
print(interval(start_time,end_time)/minutes(1))

Time difference of 2.996528 days
Time difference of 2.996528 days
[1] 4315
[1] 4315

Create a time-grid

Create 5 minutes interval timestamp from start_time to end_time.

start_time=ymd("2021-01-01",tz="America/Indianapolis")
end_time=ymd_hms("2021-01-03 23:55:00",tz="America/Indianapolis")
minute_diff=interval(start_time,end_time)/minutes(1)

# it creates
time_grid=start_time+(0:(minute_diff/5))*dminutes(5)
print(time_grid[1:2])
print(tail(time_grid,2))

[1] "2021-01-01 00:00:00 EST" "2021-01-01 00:05:00 EST"
[1] "2021-01-03 23:50:00 EST" "2021-01-03 23:55:00 EST"

print(sessionInfo())

R version 3.6.3 (2020-02-29)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] data.table_1.12.2 lubridate_1.7.4   forcats_0.5.0     stringr_1.4.0    
 [5] dplyr_0.8.0.1     purrr_0.3.4       readr_1.4.0       tidyr_0.8.3      
 [9] tibble_2.1.1      ggplot2_3.1.1     tidyverse_1.2.1  

loaded via a namespace (and not attached):
 [1] pbdZMQ_0.3-3     tidyselect_0.2.5 repr_0.19.2      haven_2.3.1     
 [5] lattice_0.20-38  colorspace_1.4-1 generics_0.0.2   vctrs_0.3.5     
 [9] htmltools_0.3.6  base64enc_0.1-3  rlang_0.4.9      pillar_1.4.7    
[13] glue_1.4.2       withr_2.1.2      modelr_0.1.8     readxl_1.3.1    
[17] uuid_0.1-2       lifecycle_0.2.0  plyr_1.8.4       munsell_0.5.0   
[21] gtable_0.3.0     cellranger_1.1.0 rvest_0.3.3      evaluate_0.13   
[25] broom_0.5.2      IRdisplay_0.7.0  Rcpp_1.0.1       scales_1.0.0    
[29] backports_1.1.4  IRkernel_0.8.15  jsonlite_1.6     hms_0.5.3       
[33] digest_0.6.18    stringi_1.4.3    grid_3.6.3       cli_1.1.0       
[37] tools_3.6.3      magrittr_1.5     lazyeval_0.2.2   crayon_1.3.4    
[41] pkgconfig_2.0.2  ellipsis_0.3.1   xml2_1.2.0       assertthat_0.2.1
[45] httr_1.4.0       rstudioapi_0.13  R6_2.4.0         nlme_3.1-139    
[49] compiler_3.6.3

mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb	rleid
21.0	6	160	110	3.90	2.620	16.46	0	1	4	4	1
21.0	6	160	110	3.90	2.875	17.02	0	1	4	4	1
22.8	4	108	93	3.85	2.320	18.61	1	1	4	1	1
21.4	6	258	110	3.08	3.215	19.44	1	0	3	1	2
18.7	8	360	175	3.15	3.440	17.02	0	0	3	2	2
18.1	6	225	105	2.76	3.460	20.22	1	0	3	1	2

mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb	rleid	row_number
21.0	6	160	110	3.90	2.620	16.46	0	1	4	4	1	1
21.0	6	160	110	3.90	2.875	17.02	0	1	4	4	1	2
22.8	4	108	93	3.85	2.320	18.61	1	1	4	1	1	3
21.4	6	258	110	3.08	3.215	19.44	1	0	3	1	2	1
18.7	8	360	175	3.15	3.440	17.02	0	0	3	2	2	2
18.1	6	225	105	2.76	3.460	20.22	1	0	3	1	2	3

mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
21.0	6	160	110	3.90	2.875	17.02	0	1	4	4
22.8	4	108	93	3.85	2.320	18.61	1	1	4	1
21.4	6	258	110	3.08	3.215	19.44	1	0	3	1

eventid	start	end
1	2016-10-01 04:30:00	2016-10-01 05:43:00
2	2016-10-01 18:02:00	2016-10-02 01:23:00
3	2016-10-02 14:21:00	2016-10-04 08:54:00

date	start	end
10/1/2016	2016-10-01	2016-10-01 23:59:59
10/2/2016	2016-10-02	2016-10-02 23:59:59
10/3/2016	2016-10-03	2016-10-03 23:59:59
10/4/2016	2016-10-04	2016-10-04 23:59:59
10/5/2016	2016-10-05	2016-10-05 23:59:59
10/6/2016	2016-10-06	2016-10-06 23:59:59

vec	out
1.0	0
2.0	0
4.0	0
5.0	1
5.0	1
5.0	1
6.1	0
6.0	0
6.2	0
-1.0	0
-1.0	0
-1.0	0
8.0	1
8.0	1
8.0	1

channel	type01	type02	type03	type04
c01	ahu1	ahu1	ahu1	disposal
c02	disposal	disposal	ahu2	ahu1
c03	ahu2	kitchen1	disposal	ahu2
c04	kitchen1	ahu2	kitchen1	kitchen1
c05	microwave	light	room	living

var_name	c01	c02	c03	c04	c05
type01	ahu1	disposal	ahu2	kitchen1	microwave
type02	ahu1	disposal	kitchen1	ahu2	light
type03	ahu1	ahu2	disposal	kitchen1	room
type04	disposal	ahu1	ahu2	kitchen1	living

change_id	change_time	action_time	interaction_id	interaction_time
c1	2020-11-10 15:05:00	NA	NA	NA
c2	2020-11-10 16:15:00	2020-11-10 16:17:30	i1	2020-11-10 16:17:30
c3	2020-11-10 16:20:00	2020-11-10 16:19:25	i2	2020-11-10 16:19:25
c4	2020-11-10 17:30:00	2020-11-10 17:31:30	i3	2020-11-10 17:31:30