#########################################################################
# Direct Small Area Estimation
# Use library laeken for point and variance estimation at domain level   
# Produce maps of the estimates
# Model-based estimation of averages using area level and unit level model with the sae package
# Model-based of non-linear indicators using the emdi package
# Applications to income data using the EU-SILC survey
#########################################################################
rm(list=ls())

setwd(paste0(R.home(), "/../workshop"))

# Loading libraries and the data
library(laeken)
library(emdi)
library(colorRamps)
library(gridExtra)
library(simFrame)
library(dplyr)
library(sae)
library(ggplot2)


# Datasets
data("eusilc")
data("eusilcP")
data("eusilcA_pop")
data("eusilcA_smp")

# Head Count Ratio with survey weights
hcr_national<-arpr("eqIncome", weights = "rb050", data = eusilc)


# Point and Variance estimation - At-risk-of-poverty rate

hcr_nuts2<- arpr("eqIncome", weights = "rb050", breakdown = "db040", data = eusilc)

hcr_var<-variance("eqIncome", weights = "rb050", breakdown = "db040", design = "db040", 
         data = eusilc, indicator = hcr_nuts2, bootType = "naive", seed = 123,R=500)



# HCR - breakdown by NUTS2 and gender groups
eusilc$genderregion<-interaction(eusilc$db040,eusilc$rb090)

hcr_nuts2_gender<-arpr("eqIncome", weights = "rb050", data = eusilc, breakdown="genderregion")
hcr_nuts2_gender


# Produce estimates of Quintile Share Ratio and visualise on a Map


Karte <- readRDS("AUT_adm1.rds")
Karte@data$NAME_1 <- c("Burgenland","Carinthia","Lower Austria","Upper Austria","Upper Austria",
                                "Salzburg","Styria","Tyrol","Vorarlberg","Vorarlberg","Vienna")

# Produce the map
est_qsr<-qsr("eqIncome", weights = "rb050", data = eusilc,breakdown="db040")
ind <- match(Karte@data$NAME_1,est_qsr$strata) # Matching
Karte@data$QSR <- est_qsr$valueByStratum[ind,2] # Add estimator to map

# Define the color of the maps
ramp <- colorRamp(c("green","yellow", "red","black"))
# Plot
#pdf(file="Direct_austria_qsr.pdf", width=5, height=5, family="Times")
spplot(Karte,c("QSR"),names.attr=c("QSR"),main="Quintile Share Ratio",
              col.regions=c(rgb(ramp(seq(0, 1, length = 200)), max = 255)),par.strip.text=list(lines=1.3),
              at=c(seq(min(Karte@data$QSR),max(Karte@data$QSR),length.out=100),Inf),colorkey=TRUE)
#dev.off()



# we filter by the main income holder per household to get household data
eusilcP_HH <- eusilcP[eusilcP$main, ]

# draw random rows in order to receive a sample from the population data set
set.seed(2)
sample_id <- sample(1:25000, 400, replace = FALSE, prob = NULL)
# Receive sample on houehold level corresponding to population data
eusilcS_HH <- eusilcP[sample_id, ]

# How are the sample observations distributed in the regions?
summary(eusilcS_HH$region)
# Direct estimation of mean using sae-package
fit_direct <- direct(y = eqIncome,dom = region, data = eusilcS_HH, replace=T)



# Aggregation of the covariates on region level
eusilcP_HH_agg<-tbl_df(eusilcP_HH) %>% 
  group_by(region) %>%
  summarise(hy090n = mean(hy090n)) %>%
  ungroup() %>% 
  mutate(Domain = fit_direct$Domain)

data_frame <- left_join(eusilcP_HH_agg, fit_direct, by = "Domain") %>% 
  mutate(var = SD^2)




# Estimation of the FH-model
fit_FH <- mseFH(formula = Direct ~ hy090n,vardir = var,
                data = as.data.frame(data_frame))

# Comparison CV of direct and FH
FH_CV <- 100*sqrt(fit_FH$mse) / fit_FH$est$eblup
results <- data.frame(fit_direct, 
                      FH_est = fit_FH$est$eblup,
                      FH_CV)



# Aggregation of the covariates on region level
eusilcP_HH_agg<-tbl_df((eusilcP_HH))%>%group_by((region))%>%summarise(py010n=mean(py010n),
                                                                      py050n=mean(py050n),
                                                                    hy090n=mean(hy090n),n=n())%>%
  ungroup()%>%mutate(Domain=fit_direct$Domain)
data_frame<-left_join(eusilcP_HH_agg,fit_direct,by="Domain")%>%mutate(var=SD^2)
data_frame<-as.data.frame(data_frame)

Xmean<-data.frame(region=data_frame[,1],hy090n=data_frame$hy090n,py010n=data_frame$py010n,py050n=data_frame$py050n)
Popsize<-data.frame(region=data_frame[,1],n=data_frame$n)

# Estimation of the Unit-level model (Battese-Harter-Fuller)
fit_EBLUP<-eblupBHF(formula=as.numeric(eqIncome)~py010n + py050n+hy090n,dom=region,data=eusilcS_HH,meanxpop=Xmean,popnsize=Popsize)

# MSE estimation of the Unit-level model 
MSE_EBLUP<-pbmseBHF(formula=as.numeric(eqIncome)~py010n + py050n+hy090n,dom=region,data=eusilcS_HH,meanxpop=Xmean,popnsize=Popsize)

# Comparison with direct estimator
EBLUP_CV<-100*sqrt(MSE_EBLUP$mse$mse)/fit_EBLUP$eblup$eblup
results<-data.frame(fit_EBLUP$eblup,EBLUP_CV)

# EBP estimation
set.seed(1)

#EBP with Box-Cox Transformation and bootstrap MSE 

emdi_model_box_cox <- ebp(fixed = eqIncome ~ gender + eqsize + cash +
                    self_empl + unempl_ben + age_ben + surv_ben + sick_ben + dis_ben + rent +
                    fam_allow + house_allow + cap_inv + tax_adj, pop_data = eusilcA_pop,
                  pop_domains = "district", smp_data = eusilcA_smp, smp_domains = "district",
                  threshold = 0.6*median(eusilcA_smp$eqIncome), transformation = "box.cox",
                  L = 50, MSE = TRUE, B = 50,custom_indicator =
                    list(my_max = function(y, threshold){max(y)},
                         my_min = function(y, threshold){min(y)}), na.rm = TRUE, cpus = 1)
summary(emdi_model_box_cox)
plot(emdi_model_box_cox)
#Output of the results
results=estimators(emdi_model_box_cox, indicator = "Median", MSE = TRUE)




load_shapeaustria()


# Create a suitable mapping table to use numerical identifiers of the shape
# file
# First find the right order
dom_ord <- match(shape_austria_dis@data$PB, emdi_model_box_cox$ind$Domain)
# Create the mapping table based on the order obtained above
map_tab <- data.frame(pop_data_id = emdi_model_box_cox$ind$Domain[dom_ord],
                      shape_id = shape_austria_dis@data$BKZ)
# Create map plot for mean indicator - point and CV estimates but no MSE
# using the numerical domain identifiers of the shape file
map_plot(object = emdi_model_box_cox, MSE = FALSE, CV = FALSE,
         map_obj = shape_austria_dis, indicator = c("Median"),
         map_dom_id = "BKZ", map_tab = map_tab)