# Script for fitting a linear regression model with interaction terms to output data from 4 LES-LM simulations

# Armin Sigmund


# Settings ------------------------------------------------------------------------
rm(list= ls())

# working directory
setwd("~/Documents/PhD_work/my_paper_drafts/paper_parametrization/data_for_EnviDat/Postprocessing_code/")
# functions
source("my_functions/func_model_selection.R")
# path to post-processed LES data
# path = "../LES-LM_main_results/"
files = c("particles_per_d_and_z_Case_1.Rdata",
          "particles_per_d_and_z_Case_2.Rdata",
          "particles_per_d_and_z_Case_3a.Rdata",
          "particles_per_d_and_z_Case_3b.Rdata")
# names of simulations specified in files
sim.names = paste("Case",c("1","2","3a","3b"))
## Characteristics of the 4 LES simulation (rows) for two 200-s periods during quasi-steady state, needed for the model fitting
source("config_model_fitting.R") # provides the variables ustar, delta.t.air, and rh

library(glmnet)
library(leaps)


# Read LES data post-processed in 2_prepare_input_for_lm.R -----------
d_p_bin = list()
dat     = list()
for(i in 1:length(files)){
  load( files[i] )
  d_p_bin[[i]] = d_p_center
  dat[[i]]     = avg.per.dp
}
names(dat) = names(d_p_bin) = sim.names

for(i in 1:2){ print(sapply(d_p_bin, function(x){length(x[[i]])})) }
sapply(dat, function(x){dim(x[[1]]$delta_T)})

# heights (m)
my.z = as.numeric(colnames(dat[[1]][[1]]$delta_T))  
# select heights to be considered in regression (lowest 0.3 m)
my.z = my.z[1:20]

# nested list of data.frames (x: simulations, k: time periods, i: heights)
my.data = lapply(1:length(dat), function(x){ # for each simulation
  # for each time period
  lapply(1:length(d_p_bin[[x]]), function(k){
    # extract certain range of diameter classes
    in.focus = which(d_p_bin[[x]][[k]] > 50e-6 & d_p_bin[[x]][[k]] < 500e-6)
    # for each height data.frames with change in particle temperature (K s-1)
    lapply(1:length(my.z), function(i){
      na.omit( data.frame(delta_T = dat[[x]][[k]]$delta_T[in.focus, i],
                          d_p = d_p_bin[[x]][[k]][in.focus], 
                          z = my.z[i], 
                          delta.t.air = delta.t.air[x, k], 
                          rh = rh[x, k], 
                          # delta.q = delta.q[x, k],
                          ustar = ustar[x, k]) )
    })
  })
})
names(my.data) = sim.names
# list of data frames, one data frame per simulation
dat.per.sim = lapply(my.data, function(x){
  do.call(rbind, lapply(x, function(k){
    do.call(rbind, k) 
  }))
})
# dataframe containing all original data
my.df.orig = do.call( rbind, dat.per.sim )
# transform variables
dat.trans = lapply(dat.per.sim, function(i){
  data.frame( abs_delta_T = abs(i$delta_T),
              d_p         = i$d_p, # sqrt_d_p    = sqrt(i$d_p), #
              z.trans     = log(i$z), #z.trans      = sqrt(i$z), #
              abs.delta.t.air = abs(i$delta.t.air),
              rh.trans        = sign(i$delta.t.air) * (i$rh - 100), # deviation from RH = 100% # make sure that unsaturated (oversaturated) air contributes with cooling (warming) effect if delta.t.air is zero
              ustar           = i$ustar )
})
# dataframe containing all transformed variables
my.df.trans = do.call( rbind, dat.trans )
# standardize input variables
my.mean = apply(my.df.trans, 2, mean)
my.sd   = apply(my.df.trans, 2, sd)
in.input = 2:NCOL(my.df.trans)
dat.standard = lapply(dat.trans, function(i){
  tmp = as.data.frame( sapply(in.input, function(x){ (i[,x] - my.mean[x]) / my.sd[x] }) )
  names(tmp) = names(i[,in.input])
  return(tmp)
})
my.df = do.call( rbind, dat.standard )
# define weights for regression
my.weight = -31.35889 * my.df.orig$z + 10.23519 # linear function of height between 10 at lowest height and 1 at uppermost height
for(i in 1:length(dat.per.sim)){
  dat.standard[[i]]$my.weight = -31.35889 * dat.per.sim[[i]]$z + 10.23519
}


# Variable selection based on best subset selection ----------------------------------------

# Regression formula: linear model with interaction up to second order
# Use absolute magnitude of temperature change because sign seems to be driven by sign of vertical temperature difference
my.formula = as.formula("my.df.trans$abs_delta_T ~ (d_p + z.trans + abs.delta.t.air + rh.trans)^2") # + ustar

# Consider models with all first-order terms and variable number of second-order terms (interaction terms)
# Fit best subset models to three out of four simulations and compute weighted MSE on the remaining simulation
# number of terms in full model
nvar = 10 #15
# allocate matrix for weighted MSE
MSR.weighted = matrix(data = NaN, nrow = length(dat.per.sim), ncol = nvar-4) #5)
# index of variable to be omitted
# Model selection by exhaustive search (determine the best one for each subset)
sub.mods = regsubsets(my.formula, data = my.df, nvmax = nvar, force.in = 1:4, nbest = 1, weights = my.weight)
# index of simulation used as test data
for (test.sim in 1:length(dat.standard)){
  # training data (other 3 simulations)
  dat.train         = do.call(rbind, dat.standard[-test.sim])
  abs_delta_T_train = do.call(c, lapply(dat.trans[-test.sim], function(i){ i$abs_delta_T }))
  # subset size
  for (k in 1:(nvar-4)){ #5)){
    # fit model
    formula.k = get_model_formula(id = k, object = sub.mods, outcome = "abs_delta_T_train")
    mod.k     = lm(formula.k, data = dat.train, weights = dat.train$my.weight)
    # print(paste("k =",k, ", test.sim =",test.sim,":"))
    # print(summary(mod.k))
    if(k %in% c(9,10)) print(paste("k = ",k))
    # test the model
    prediction.k = predict( object = mod.k, newdata = dat.standard[[test.sim]] )
    new.weights = dat.standard[[test.sim]]$my.weight
    true.values = dat.trans[[test.sim]]$abs_delta_T
    # weighted mean squared residuals 
    MSR.weighted[test.sim, k] = sum( new.weights * (prediction.k - true.values)^2 ) / sum(new.weights)
  }
}

# determine Akaike AIC criterion
my.aic = rep(NA, nvar-4) #5)
for (k in 1:(nvar-4)){ #5)){
  formula.k = get_model_formula(id = k, object = sub.mods, outcome = "my.df.trans$abs_delta_T")
  mod.k     = lm(formula.k, data = my.df, weights = my.weight)
  my.aic[k] = AIC(mod.k)
}

# Choose best model and fit to entire data set
my.id = 3 #5 #
formula.best   = get_model_formula(id = my.id, object = sub.mods, outcome = "my.df.trans$abs_delta_T")
mod.best       = lm(formula.best, data = my.df, weights = my.weight)
# repeat with non-standardized variables (same response) as the resulting coefficients are easier to implement in the sublimation parameterization
mod.best.trans = lm(formula.best, data = my.df.trans, weights = my.weight)
summary(mod.best)
summary(mod.best.trans)
mean( weighted.residuals(mod.best)^2 )
cor(mod.best$fitted.values, abs( my.df.orig$delta_T )) # 0.63
# save(mod.best, file = "lm_best_subset_dx10cm.Rdata")


# plot weighted MSR as function of subset size (number of interaction terms) ----------------------------------
# Figure S4

info.best = summary(sub.mods)
sub.size = as.numeric(rownames(info.best$which))
sub.size = sub.size - sub.size[1] + 1

pdf("MSR_vs_n_subset_regression_dx10cm.pdf", width = 8, height = 7)
par(mfrow = c(2,1), mar = c(0.3,4,1,4), oma = c(3,0,0.5,0))
my.msr = info.best$rss / sum(my.weight)
yrange1 = range(my.msr)
plot(sub.size, my.msr, type = "o", xlab = "", ylab = "", xaxt = "n", ylim = yrange1 + c(0, 0.05*diff(yrange1)))
title(ylab = expression("Weighted MSR"~(K^2~s^{-2})),line = 2.2)
axis(1, at = sub.size, labels = F)
abline(v=my.id, lty = 2, col = "darkgrey")
text(x = 1, y = yrange1[2] - 0.2*diff(yrange1), labels = "a", cex = 1.5, font = 2)
# Allow a second plot on the same graph
par(new=TRUE)
#count = count + 1
yrange = range(my.aic)
ylims  = yrange - c(0.05*diff(yrange), 0)
plot(x = sub.size, y = my.aic, type = "o", pch = 2, col = 2, ylim = ylims, axes = F, xlab="", ylab="")
axis(4, ylim=ylims, col=2,col.axis=2)#,las=1)
mtext("AIC",side=4,col=2,line=2.5)
legend("topright", legend = c("Weighted MSR", "AIC", "Selected option"), pch = c(1,2,NA), lty = c(1,1,2), col = c(1,2,"darkgrey"))
#
MSR.w.mean = apply(MSR.weighted, MARGIN=2, mean, na.rm = T)
# set last two values to NA because of "rank-deficient fit" (not enough data to fit all terms)
my.n = length(MSR.w.mean)
# MSR.w.mean[(my.n-1):my.n] = NA
MSR.w.sd   = apply(MSR.weighted, MARGIN=2, sd, na.rm = T)
plot(x = 1:my.n, MSR.w.mean, type = "o", ylim = range(c(0, MSR.w.mean[1:(my.n-1)] + MSR.w.sd[1:(my.n-1)]), na.rm = T), xlab = "",
     ylab = "", xaxt = "n")
axis(1, at = sub.size)
title(xlab = "Number of interaction terms", ylab = expression("Weighted MSR"~(K^2~s^{-2})), line = 2.2, xpd = NA)
lines(MSR.w.mean + MSR.w.sd, lty = 3, lwd = 1.3)
lines(MSR.w.mean - MSR.w.sd, lty = 3, lwd = 1.3)
abline(v=my.id, lty = 2, col = "darkgrey")
text(x = 1, y = 0.06, labels = "b", cex = 1.5, font = 2)
legend("bottomright", legend = expression(Average,Average%+-%sigma, Selected~option), 
       pch = c(1,NA,NA), lty = c(1,3,2), col = c(1,1,"darkgrey"), lwd = c(1,1,1.3), bg = "white")
dev.off()


# Plots of selected model --------------------------------------------------------
# Figures S5 and S6

rows.sim = sapply(dat.per.sim, NROW)
# color vector for different simulations
my.cols = c(1,2,4,"orange")

x.all = model.matrix(object = formula.best, data = my.df)[, -1]

my.pchs = c(1,2,4,5)
in.z.plot = c(1:4,12,20) #
in.plot = which(my.df.orig$z %in% my.z[in.z.plot[1:5]])
xlims = 1e6 *range(my.df.orig$d_p)
ylims = c( -0.05, max( my.df.trans$abs_delta_T[in.plot] ) ) 
my.df$caseID = do.call( c, lapply(1:4, function(i){ rep(i, rows.sim[i]) }) )
pchs = my.pchs[my.df$caseID]
cols = my.cols[my.df$caseID]

# complete plot for paper
pdf("Tp-change_vs_dp_with_best_subset_dx10cm.pdf", width = 8, height = 9)
n.panels = length(in.z.plot)
par(mfrow = c(n.panels/2, 2), mar = c(1,2,0.5,0.5), oma = c(3.5,3,0,0), cex.axis = 1.2, cex.lab = 1.2)
for(j in 1:n.panels){
  in.z1 = which(my.df.orig$z == my.z[ in.z.plot[j] ])
  l.bound = c(0, which(diff(in.z1) > 1), length(in.z1))
  xlabel = ifelse(j %in% (n.panels - c(1,0)), T, F)
  xlabs = ifelse(xlabel, expression(d[p]~(mu*m)), "")
  ylabel = T #ifelse(j %in% c(1,3), T, F)
  ylabs = ifelse(j %in% c(1,3,5), expression("|"*dT[p]/dt*"|"~(K~s^{-1})), "")
  my.ylim = ylims #if(j == 6){ ylims.last }else{ ylims }
  plot(1e6*my.df.orig$d_p[in.z1], my.df.trans$abs_delta_T[in.z1], pch = pchs[in.z1], col = cols[in.z1], cex = 0.6, 
       xlab = "", ylab = "", xlim = xlims, ylim = my.ylim, xaxt = "n", yaxt = "n")
  title(xlab = xlabs, ylab = ylabs, xpd = NA, cex = 1.2)
  axis(1, labels = xlabel); axis(2, labels = ylabel) 
  for(i in 1:(length(l.bound)-1)){
    in.show = in.z1[(l.bound[i]+1):l.bound[i+1]]
    lines(1e6*my.df.orig$d_p[in.show], predict(mod.best, newx = x.all)[in.show], col = my.cols[my.df$caseID[in.show[1]]])
  }
  if(j == 1) legend("topright", legend = c("1","2","3a","3b","Fit"), title = "Case", 
                    pch = c(my.pchs,NA), col = c(my.cols,1), lty = c(rep(NA,4),1), horiz = T, xpd = NA, cex = 1.2)
  lab.j = paste("(",bquote(.(letters[j])),") z = ",bquote(.(round(my.z[in.z.plot[j]],digits = 4)))," m", sep ="")
  y.txt.factor = c(0.22, rep(0.03, n.panels-2), 0.07)
  y.txt = my.ylim[2] - y.txt.factor[j]*diff(my.ylim)
  text(x = 100, y = y.txt, labels = lab.j, adj = c(0,0.5), cex = 1.2)
}
dev.off()

# plot Tp change as function of height for certain particle diameter
my.ylim = c(-0.05,1.15)
dp.plot = c(0.000055, 0.000125, 0.000195, 0.000265, 0.000375, 0.000495)
in.dp = lapply(dp.plot, function(i){ which(abs(my.df.orig$d_p - i) < 3e-6) })
pdf("Tp-change_vs_z_with_best_subset_dx10cm.pdf", width = 8, height = 9)
par(mfrow = c(n.panels/2, 2), mar = c(1,2,0.5,0.5), oma = c(3.5,3,0,0), cex.axis = 1.2, cex.lab = 1.2)
for(i in 1:length(dp.plot)){
  in.curr = in.dp[[i]]
  xlabel = ifelse(i %in% (n.panels - c(1,0)), T, F)
  xlabs = ifelse(xlabel, expression(z~(m)), "")
  ylabel = ifelse(i %in% c(1,3,5), T, F)
  ylabs = ifelse(i %in% c(1,3,5), expression("|"*dT[p]/dt*"|"~(K~s^{-1})), "")
  plot(my.df.orig$z[in.curr], my.df.trans$abs_delta_T[in.curr], pch = pchs[in.curr], col = cols[in.curr], cex = 0.6, 
       xlab = "", ylab = "", xaxt = "n", yaxt = "n", ylim = my.ylim) 
  axis(1, labels = xlabel); axis(2, labels = ylabel)
  title(xlab = xlabs, ylab = ylabs, xpd = NA, cex = 1.2)
  l.bound = c(0, which(diff(my.df.orig$z[in.curr]) < 0), length(in.curr))
  for(j in 1:(length(l.bound)-1)){
    in.show = in.curr[(l.bound[j]+1):l.bound[j+1]]
    lines(my.df.orig$z[in.show], predict(mod.best, newx = x.all)[in.show], col = my.cols[my.df$caseID[in.show[1]]])
  }
  if(i == 1) legend("topright", legend = c("1","2","3a","3b","Fit"), title = "Case", 
                    pch = c(my.pchs,NA), col = c(my.cols,1), lty = c(rep(NA,4),1), xpd = NA, cex = 1.2) # horiz = T
  text(x = 0.01, y = 1.05, labels = bquote("("*.(letters[i])*")"~d[p]~"="~.(dp.plot[i]*1e6)~mu*m), adj = c(0,0.5), cex = 1.2)
}
dev.off()


# Write training data and parameterized data to file ----------------------

# create vector with time period label (1 or 2)
n.tp = lapply( my.data, function(i){ 
  sapply(i, function(j){
    sum(sapply(j, NROW))
  })
})
period = do.call(c, lapply(n.tp, function(i){ c(rep(1, i[1]), rep(2, i[2])) }))
df.out = data.frame(Case = as.character(my.df$caseID), 
                    period,
                    dTpdt_abs       = round(my.df.trans$abs_delta_T, digits = 4),
                    dTpdt_abs_param = round(predict(mod.best, newx = x.all), digits = 4),
                    my.df.orig[,2:5]) #6])
df.out$Case[ df.out$Case == "3" ] = "3a"
df.out$Case[ df.out$Case == "4" ] = "3b"
colnames(df.out)[5:NCOL(df.out)] = c("dp","z","delta_T","RH_1") #,"ustar")
write.csv(df.out, file = paste("dTpdt-abs_bin-avg_regression.csv",sep=""), na = "NaN", row.names = F)



