# Estimation of a causal effect of an agricultural practice based on observational data
# Korgan Aldebert (Acta), François Brun (Acta), David Makowski (Inrae), 2023-12-18
# Fertigation data : part 1. matching methods

library(ggplot2)
library(gridExtra)
library(Matching)
library(cobalt)

# Loading the fertigation data. The data present a bias in the amount of irrigation 
# depending on the implementation of fertigation (see part 1)
# the data are simulated, thus we know the "true" effects 
# intercept: 72.94 coeff_Drip : 8.49 coeff_irrigation Irrigation : 0.02904 coeff_nitrogen : 0.1
dfbiais = read.table("Drip_Fertigation_biais.csv", sep=";",header=T)
dfbiais$treat = ifelse(dfbiais$Drip=="Yes",1,0)

# to remove - for reproducible results only
set.seed(1) # 

################################################################################
# Step 1. Propensity score estimation using logistic regression
# Assuming we have an expert to identify the relevant variables 
# that are sources of confusion : Irrigation and Nitrogen
formule = c("treat ~ Irrigation + Nitrogen") 

# we want to predict the probability (binomial with a logit link) to have a treatement according to the variables
reg = glm(formule, family = binomial(link = "logit"), data=dfbiais)
dfbiais$pscore = predict(reg, type='response')
summary(reg)


################################################################################
# This approach estimates the effect on a smaller sample of the initial population, 
# particular attention should be paid to the characteristics of the population after matching
# alternative with the Matching package
# Step 2bis. 2nd possibility : Matching method with the Matching packages (more option and more efficient)

# here, we use the ATE
# The average treatment effect (ATE) is used when we are interested in the average treatment of the entire population, 
# whereas the average treatment effect of the treated (ATT) is used when we are only interested in the average treatment effect of those treated

set.seed(123)
Appariement = Match(Y = NULL, Tr = dfbiais$treat, 
                    X = dfbiais$pscore, 
                    estimand = "ATE", # sample average treatment effect for all the individuals (treated and control)
                    M = 2, # number of neighbours (2 controle for 1 treated)
                    caliper = 0.20, # standard deviation coefficient, distance threshold for matching 
                    replace=TRUE, # with replacement
                    ties=FALSE) # If an observation matches with several controle, we include them all

################################################################################
# Step 3. Comparison of the dataset before and after matching
# This approach estimates the effect on a smaller sample of the initial population, 
# particular attention should be paid to the characteristics of the population after matching

# Calculation of covariate equilibria with the cobalt package
# ATE => s.d.denom = "pooled"
balApp = bal.tab(Appariement,formula = treat ~ Irrigation + Nitrogen + pscore, data = dfbiais, 
                 un = TRUE, disp.v.ratio = TRUE, s.d.denom="pooled", quick = FALSE)
# In the form of a table, this also shows us the number of matched individuals; 
# 104 treated individuals for 47 controls. As the matching was with a discount, this tells us that a large proportion of the controls were in fact assigned several times. 
# indicates that a large proportion of the controls are in fact assigned several times.
balApp

# Verification of computation between bal.tab and the manual formula
#The computed spread bal.tab() uses is always that of the full, unadjusted sample (i.e., before matching, weighting, or subclassification), as recommended by Stuart (2010).
# ATE => pooled SD
var = "Irrigation"
num.un = mean(dfbiais[dfbiais$treat==1,var]) - mean(dfbiais[dfbiais$treat==0,var])
denom.un =  sqrt(1/2*(sd(dfbiais[dfbiais$treat==0,var]))^2 + 1/2*(sd(dfbiais[dfbiais$treat==1,var]))^2)
format(round(num.un/denom.un,4),nsmall=4)
 
num.adj =mean(dfbiais[Appariement$index.treated,var]) - mean(dfbiais[Appariement$index.control,var] )
format(round(num.adj/denom.un,4),nsmall=4)
#not recommended denom.adj = sqrt(1/2*(var(dfbiais[Appariement$index.treated,"Irrigation"])) + 1/2*(var(dfbiais[Appariement$index.control,"Irrigation"])))


# All the variable as a graph
love.plot(x = balApp, stat = "mean.diffs", abs = TRUE, var.order = "unadjusted", 
          threshold = 0.2, shape = 23)

# density curves
bal.plot(Appariement, formula = treat ~ Yield, data = dfbiais, var.name = "Yield", which = "both", colors=c("skyblue", "salmon"))
bal.plot(Appariement, formula = treat ~ pscore, data = dfbiais, var.name = "pscore", which = "both", colors=c("skyblue", "salmon"))
bal.plot(Appariement, formula = treat ~ Irrigation, data = dfbiais, var.name = "Irrigation", which = "both", colors=c("skyblue", "salmon"))
bal.plot(Appariement, formula = treat ~ Nitrogen, data = dfbiais, var.name = "Nitrogen", which = "both", colors=c("skyblue", "salmon"))

################################################################################
# Step 4. The result : comparison of the means of our variable of interest between the two populations of individuals
# Two equivalent ways of estimating the effect; directly by the summary, 
# or by retrieving the indices of the new population from the Matching object.
# Both estimates are based on a Paired t-test for comparison of means.
summary(Appariement)
t.test(dfbiais[Appariement$index.treated,]$Yield,
       dfbiais[Appariement$index.control,]$Yield, paired = T)

# end of file

