########LOADING DATA
library(dplyr)
library(tidyr)
library(readr)
source("myFunctions.R")

## Read the raw panel. Empty strings, "NA" and ".." are all treated as missing.
## The first column is dropped right away (presumably a row index written out
## when the CSV was saved -- TODO confirm).
wb <- read.csv(
  "wbFinal.csv",
  stringsAsFactors = FALSE,
  na.strings = c(NA, "", "NA", "..")
)[, -1]

### Integrity checks on the data frame
## Per-year summary: countNA() (defined in myFunctions.R; presumably a count of
## missing or non-missing values -- confirm there) applied to every
## non-grouping column. summarise_all() replaces the deprecated
## summarise_each(funs(...)) form and keeps the original column names.
wbYr <- wb %>%
  group_by(year) %>%
  summarise_all(countNA)

## Rows per country-year; anything above 1 is a duplicated observation.
wb_countID <- wb %>%
  group_by(id, year) %>%
  summarise(n = n())
wb_overCount <- wb_countID[which(wb_countID$n > 1), ]

## Income category from GDP per capita (constant 2010 USD).
## Bands are mutually exclusive, so assignment order does not matter:
##   Low:    x <= 1035
##   Middle: 1035 < x < 12616
##   High:   x >= 12616
## Rows with missing GDP stay NA.
## NOTE(review): the cutoffs look like World Bank income thresholds -- confirm.
wb$income <- NA
wb$income[which(wb$gdp_pc_constUSD2010 <= 1035)] <- "Low Income"
wb$income[which(wb$gdp_pc_constUSD2010 >= 12616)] <- "High Income"
wb$income[which(wb$gdp_pc_constUSD2010 > 1035 &
                  wb$gdp_pc_constUSD2010 < 12616)] <- "Middle Income"

## Military expenditure as a percentage of GDP, both in current LCU.
wb$mspend_gdp_lcu <- 100 * wb$sipri_lcu / wb$gdp_curr_LCU
## Division by zero yields Inf/NaN; is.finite() is FALSE for those (and for NA),
## so every non-finite result is normalised to NA.
wb$mspend_gdp_lcu[!is.finite(wb$mspend_gdp_lcu)] <- NA



## Log-transform every numeric variable except `year`, storing the result in
## wbT (wb itself is left untouched):
##   * minimum > 0  -> log(x)
##   * minimum == 0 -> log(x + 1)
##   * minimum < 0  -> logNeg() (from myFunctions.R) applied within each year
## Non-numeric columns and columns with no observed values are skipped.
vars <- setdiff(names(wb), "year")  # safe even if `year` were absent
wbT <- wb
for (v in vars) {
  v_vals <- wbT[[v]]
  ## Skip early: min() on an all-NA column would return Inf with a warning.
  if (!is.numeric(v_vals) || !any(!is.na(v_vals))) {
    next
  }
  v_min <- min(v_vals, na.rm = TRUE)
  if (v_min > 0) {
    wbT[, v] <- log(v_vals)
  } else if (v_min == 0) {
    wbT[, v] <- log(v_vals + 1)
  } else {
    ## Negative values present: transform within year groups. mutate_all()
    ## leaves the grouping column (`year`) untouched.
    wbT[, c("year", v)] <- wbT[, c("year", v)] %>%
      group_by(year) %>%
      mutate_all(logNeg) %>%
      data.frame()
  }
}

## Sorted country ids that fall in each income band in at least one year.
## %in% is FALSE for NA income, so unclassified rows are ignored.
LICs <- sort(unique(wb$id[wb$income %in% "Low Income"]))
MICs <- sort(unique(wb$id[wb$income %in% "Middle Income"]))
HICs <- sort(unique(wb$id[wb$income %in% "High Income"]))
## High-income countries that were never classified as middle income
HIC_i <- setdiff(HICs, MICs)
## Middle-income countries that were never classified as low income
MIC_i <- setdiff(MICs, LICs)

## Analysis data set: the log-transformed panel restricted to the model variables.
vars <- c("id", "year", "co2_pc", "gdp_pc_constUSD2010",
          "urbanPop_prcnt", "mspend_gdp_lcu", "income")
df <- wbT[, vars]

##Labels
library(Hmisc)
## One descriptive label per column of df, in column order.
## NOTE(review): the income cutoffs used earlier resemble World Bank thresholds;
## confirm that "IMF" is the intended attribution in the last label.
dfLabels <- c("Country",
              "Year",
              "Territorial emissions in tCO2 per person",
              "GDP per capita",
              "Urban population as % of total population",
              "Military expenditures as % GDP",
              "IMF Income Category"
              )

## Fail loudly if the label vector and the column set drift apart.
stopifnot(length(dfLabels) == ncol(df))
## seq_along() instead of 1:n, and no variable named `c` shadowing base::c.
for (i in seq_along(dfLabels)) {
  Hmisc::label(df[[i]]) <- dfLabels[i]
}



## With labels attached, switch to short column names for modelling.
df <- setNames(df, c("ID", "YR", "CO2", "GDP", "URBAN", "MIL", "income"))

## Keep only rows in which every variable is observed.
df <- df[complete.cases(df), ]

## For each year, starting in 2014 and working backwards, count how many
## countries have complete data in every year of the window
## (2014, then 2014-2013, 2014-2012, ..., 2014-1960). With a balanced panel the
## country set can only shrink as the window reaches further back.
##   n        : complete-case rows in that year
##   balanced : countries present in every year from 2014 back to that year
##   n_LIC    : balanced-panel countries classified "Low Income" in that year
##   n_HIC    : balanced-panel countries classified "High Income" in that year
mf_year <- df %>%
  dplyr::group_by(YR) %>%
  dplyr::summarise(n = n())
mf_year$balanced <- NA
mf_year$n_LIC <- NA
mf_year$n_HIC <- NA
mf_year_id <- df %>%
  dplyr::group_by(YR, ID) %>%
  dplyr::summarise(n = n(), income = first(income))
for (y in 2014:1960) {
  in_year <- mf_year_id$YR == y
  ids_y <- unique(mf_year_id$ID[which(in_year)])
  c_LIC <- unique(mf_year_id$ID[which(in_year & mf_year_id$income == "Low Income")])
  c_HIC <- unique(mf_year_id$ID[which(in_year & mf_year_id$income == "High Income")])
  if (y == 2014) {
    cList <- ids_y
  } else {
    ## Running intersection: countries present in every year seen so far
    cList <- intersect(ids_y, cList)
  }
  ## Restrict the income groups to the balanced panel. intersect() returns the
  ## ids themselves, so length() counts matching countries; a logical %in% mask
  ## would have the length of the whole income group instead.
  cList_LIC <- intersect(c_LIC, cList)
  cList_HIC <- intersect(c_HIC, cList)
  row_y <- which(mf_year$YR == y)  # empty when the year has no data -> no-op
  mf_year$balanced[row_y] <- length(cList)
  mf_year$n_LIC[row_y] <- length(cList_LIC)
  mf_year$n_HIC[row_y] <- length(cList_HIC)
}
##Dataset attenuates from n=149 in 2014 to n=41 in 1960 (!)


## Collapse income into two categories: high and middle income are merged,
## low income stays as is. The first factor level is the merged category.
df$income[df$income %in% c("High Income", "Middle Income")] <-
  "High and Middle Income"
df$income <- factor(
  df$income,
  levels = c("High and Middle Income", "Low Income")
)

## Balanced and unbalanced panels
## Observations (complete-case years) per country
mf <- df %>%
  dplyr::group_by(ID) %>%
  summarise(n = n())

## Unbalanced panel: every country with at least 2 observations
## (author's note: n=161)
mf_ub <- dplyr::filter(mf, n > 1)
countryList_ub <- unique(mf_ub$ID)
df_ub <- dplyr::filter(df, ID %in% countryList_ub)

## Balanced panel: countries observed in every year, tolerating one missing year
bNum <- length(unique(df$YR))  # number of distinct years in the data
mf <- dplyr::filter(mf, n >= bNum - 1)
countryList <- unique(mf$ID)
df <- dplyr::filter(df, ID %in% countryList)

##exporting data to Stata for analysis
library(foreign)
#write.dta(df,file="results/df.dta")   
#write.dta(df_ub,file="results/df_ub.dta")   
#val.labels=c( "State", "Per Capita Income", "Population") )

