• R/O
  • SSH

Tags
Keine Tags

Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

File Info

Rev. a0fb89acee5ffef9bbb62447d795fa7e0bb73bb1
Größe 14,108 Bytes
Zeit 2021-01-27 22:35:23
Autor Lorenzo Isella
Log Message

I removed the dependency from the stat_lib.R .

Content

rm(list=ls())


## last saved on  Time-stamp: "2021-01-27 14:32:37 lorenzo"







library(tidyverse)
library(janitor)
library(openxlsx)
library(janitor)


## source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")


hhi_index <- function(x){

    y <- x/sum(x)

    res <- sum(y^2)
    return(res)
    
}



na_to_pattern <- function(df, pattern){

res <- df %>% replace(., is.na(.), pattern)

return(res)    
    
}


add_total <- function(x, pos=1, ...){
    adorn_totals(x, ...) %>% as_tibble %>%
    move_row(nrow(.), pos)    
}



move_row <- function(df, ini_pos, fin_pos){

ll <- nrow(df)
    
row_pick <- slice(df, ini_pos)

    if (fin_pos=="last"){

           res <- df %>%
        slice(-ini_pos) %>% 
        add_row(row_pick, .before = ll)    
 
        
} else{
    
    res <- df %>%
        slice(-ini_pos) %>% 
        add_row(row_pick, .before = fin_pos)    
}

    return(res)
}



save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){
tryCatch({

    wb <- loadWorkbook(fileName)
    addWorksheet(wb = wb, sheet = sheetName)
  writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F,
       keepNA=na_yes    ,...)
   saveWorkbook(wb = wb, file = fileName, overwrite = T)
}, 
error = function(err){

  wb <- createWorkbook(fileName)
  addWorksheet(wb = wb, sheet = sheetName)
    writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes ,  ... )
  saveWorkbook(wb = wb, file = fileName, overwrite = T)  
})
}



'%!in%' <- function(x,y)!('%in%'(x,y))


rem_dupl_cols <- function(df){

res <- df[, !duplicated(t(df))]

return(res)

}


rem_const_cols <- function(df){
## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0]


## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1]

    res <- df %>%
        select(where(~length(unique(na.omit(.x))) > 1))

    

return(res)

}



clean_data <- function(x){

    res <- x %>%
        clean_names() %>%
        remove_empty() %>%
        distinct() %>%
        rem_dupl_cols() %>%
        rem_const_cols()

    return(res)

    
}
    
substrLeft <- function(x, n){
  substr(x,1, n )
}






n_cat <- 20 ### number of values for the discretized ranking

n_top <- 500 ## number of products to keep

eu_list <- c("BE", "BG","CZ", "DK", "DE", "EE", "IE", "GR",
                   "ES", "FR", "IT", "CY", "LV", "LT", "LU", "HU", "MT",
             "NL", "AT","PL", "PT", "RO", "SI", "SK", "FI","SE", "HR" ) %>%
    as_tibble()


               

## df_ini <- read_csv("./input_files/BACI_HS17_Y2018_V202001.csv",col_types = cols(i="c", j="c")) %>%
##     mutate(i=str_pad(i, 3,"left", "0"),
##            j=str_pad(j,3, "left","0" ))

## saveRDS(df_ini, "baci_set.RDS")


df_ini <- readRDS("baci_set.RDS")



prod_codes <- read_csv("./input_files/product_codes_HS17_V202001.csv",
                       col_types=c(.default="c")) %>%
    mutate(code=str_pad(code,6, "left", "0"))

codes <- read_csv("./input_files/country_codes_V202001.csv") %>%
    clean_data() %>%
    mutate(country_code=str_pad(country_code,3, "left", "0"))

codes_sel <- codes %>%
    select(country_code, country_name_full)


codes_eu <- codes %>%
    right_join(y=eu_list, by=c("iso_2digit_alpha"="value")) %>%
    filter(complete.cases(.)) %>% 
    select(country_code, iso_2digit_alpha)




df_imports_raw <- df_ini %>%
    select(-q) %>% 
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t)) %>%
    group_by(j,k) %>% ## this means: for every product and EU country
                      
    summarise(N=n(), i_value=sum(v, na.rm=T)) %>% #count the number of
                                               #foreign suppliers.
    ungroup %>%

## by now I got rid of the supplier index i and I see only the statistics
 ## for the product k and the EU importing MS j
    group_by(k) %>%   
    mutate( i_share=i_value/sum(i_value)) %>% 
    ungroup %>%  ## Now by grouping by k I determine what share of imports of product k goes to each importing MS
    mutate(N_eu=N*i_share)




df_i_share <- df_imports_raw %>%
    select(j,k,i_share)


save_excel(df_i_share, "Extra_EU27_import_share.xlsx")


### Now I calculate index 1.1 of the report for the imports
df_i_N_eu <- df_imports_raw %>%
    group_by(k) %>%
    summarise(N_eu_fin=sum(N_eu))  %>%
    ungroup %>% ## for every product I sum the number of suppliers for each MS weighted by the share of trade of each MS.
    arrange(N_eu_fin) %>%
    left_join(y=prod_codes, by=c("k"="code"))%>%
    select(description, everything())






### Now I start the calculation of index 1.2 of the report for the
##  

df_i_hhi <- df_ini %>%
    select(-q) %>% 
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t)) %>% ##again, all the exports to EU MS from non-EU countries
    group_by(j, k) %>% ## I group on products and importing MS
    summarise(hhi=hhi_index(v)) %>% ## and I calculate the HHI on the value of the imports of each MS from non-EU countries
    ungroup %>%
    left_join(y=df_i_share, by=c("j", "k"))  ##and I reuse the shares which I calculated before (which are nothing else but the share of imports of MS j of product k based on the total imports of k from abroad to the EU)

df_i_hhi_eu <- df_i_hhi %>%
    mutate(hhi_eu=hhi*i_share) %>%
    group_by(k) %>%
    summarise(hhi_eu_fin=sum(hhi_eu)) %>% ## the value of the HHI for the EU is again the weighted sum of those for all the MS.
    ungroup %>%
    arrange(desc(hhi_eu_fin))%>%
    left_join(y=prod_codes, by=c("k"="code"))%>%
    select(-description)



### Now indicator 2.2



df_import_price <- df_ini %>%
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t)) %>%
    group_by(k) %>%
    summarise(import_val=sum(v, na.rm=T),
              import_qty=sum(q, na.rm=T)) %>%
    ungroup %>%
    mutate(import_price=import_val/import_qty)




df_export_price <- df_ini %>%
    filter(i %in% codes_eu$country_code, j %!in% codes_eu$country_code,
           t==max(t)) %>%
    group_by(k) %>%
    summarise(export_val=sum(v, na.rm=T),
              export_qty=sum(q, na.rm=T)) %>%
    ungroup %>%
    mutate(export_price=export_val/export_qty)



df_import_export_price <- df_import_price %>%
    full_join(df_export_price, by="k") %>%
    mutate(price_diff=abs(export_price-import_price)/import_price)


df_price <- df_import_export_price %>%
    select(k, price_diff)

########################################
### Now indicator 2.1

df_intra_eu_export_qty<- df_ini %>%
    filter(i %in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t)) %>%
    group_by(k) %>%
    summarise(intra_eu_export_qty=sum(q, na.rm=T)) %>%
    ungroup ## %>%
    ## mutate(export_price=export_val/export_qty)


df_import_export_qty <- df_import_export_price %>%
    select(k, import_qty, export_qty)



df_production <- df_import_export_qty %>%
    full_join(df_intra_eu_export_qty, by="k") %>%
    mutate(production_capacity=import_qty/(export_qty+intra_eu_export_qty))


df_production_indicator <- df_production %>%
    select(k, production_capacity)



### How to rank the indicators

### N_eu: from small (few foreign suppliers) to large (many foreign suppliers)
### hhi: from large (high concentration of the imports in the hands of a few foreign suppliers ) to low
### production_capacity: from high (large ratio imports to exports i.e. low capacity for production) to low
## price diff: from high to low (products with a large gap between import and export prices are not supposed to be easy to replace )


df_out <- df_i_N_eu %>%
    full_join(df_i_hhi_eu, by="k") %>%
    full_join(df_production_indicator, by="k") %>%
    full_join(df_price, by="k") %>%
    filter(complete.cases(.)) %>% 
    mutate(rank_N=dense_rank(N_eu_fin),
           rank_hhi=dense_rank(desc(hhi_eu_fin)),
           rank_production=dense_rank(desc(production_capacity)),
           rank_price_diff=dense_rank(desc(price_diff))
           ) %>% 
    mutate(rank_N_discrete=ntile(N_eu_fin, n_cat),
           rank_hhi_discrete=ntile(desc(hhi_eu_fin), n_cat),
           rank_production_discrete=ntile(desc(production_capacity), n_cat),
           rank_price_diff_discrete=ntile(desc(price_diff), n_cat)
           )  %>%
    mutate(indicator_diversification=(rank_N+rank_hhi)/2.,
           indicator_substitutability=(rank_production+rank_price_diff)/2.,
           indicator_diversification_discrete=(rank_N_discrete+rank_hhi_discrete)/2.,
           indicator_substitutability_discrete=(rank_production_discrete+
                                                rank_price_diff_discrete)/2.
           ) %>%
    mutate(final_indicator=(indicator_diversification+
                            indicator_substitutability)/2.,
           final_indicator_discrete=(indicator_diversification_discrete+
                            indicator_substitutability_discrete)/2.
           
           ) 


save_excel(df_out, "revised_calculations.xlsx")


## df_corr <- df_out %>%
##     select(N_eu_fin:production_capacity) %>%
##     cor()


df_stat <- df_out %>%
    select(description, k, final_indicator, final_indicator_discrete)




df_top_discrete <- df_stat %>%
    select(-final_indicator) %>%
    arrange(final_indicator_discrete) %>%
    slice(1:n_top) %>%
    mutate(hs2=substrLeft(k,2),
           hs4=substrLeft(k,4))

save_excel(df_top_discrete, "top_products_discrete_ranking.xlsx")


df_top <- df_stat %>%
    select(-final_indicator_discrete) %>%
    arrange(final_indicator) %>%
    slice(1:n_top) %>%
    mutate(hs2=substrLeft(k,2),
           hs4=substrLeft(k,4))


save_excel(df_top, "top_products_ranking.xlsx")



hs2_top_discrete <- df_top_discrete %>%
    pull(hs2) %>%
    tabyl() %>%
    arrange(desc(n)) %>%
    mutate(cumulative=cumsum(percent))



hs2_top <- df_top %>%
    pull(hs2) %>%
    tabyl() %>%
    arrange(desc(n)) %>%
    mutate(cumulative=cumsum(percent))


#############################################################################
#############################################################################
#############################################################################
#############################################################################
#############################################################################
#############################################################################


### Now some tests on the calculated quantities


df_N_max <- df_i_N_eu %>%
    filter(N_eu_fin==max(N_eu_fin))

k_N_max <- df_N_max %>%
    pull(k)




test1 <- df_ini %>%
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t), k==k_N_max)  %>%
    group_by(j) %>%
    summarise(n=n(), n2=length(unique(i))) %>%
    ungroup


test2 <- df_ini %>%
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t), k==k_N_max)  %>%
    group_by(j) %>%
    summarise(val=sum(v, na.rm=T)) %>%
    ungroup %>%
    mutate(share=val/sum(val))



test12 <- test1 %>%
    full_join(y=test2, by="j") %>%
    mutate(n_share=n*share) %>%
    pull(n_share) %>%
    sum()



##############################

df_hhi_min <- df_i_hhi_eu %>%
    filter(hhi_eu_fin==min(hhi_eu_fin))


hhi_min <- df_hhi_min %>%
    pull(k)

test3 <- df_ini %>%
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t), k==hhi_min) %>%
    group_by(j) %>%
    summarise(hhi=hhi_index(v)) %>%
    ungroup




test4 <- df_ini %>%
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t), k==hhi_min) %>%
    group_by(j) %>%
    summarise(val=sum(v, na.rm=T)) %>%
    ungroup %>%
    mutate(share=val/sum(val))


test34 <- test3 %>%
    full_join(test4, by="j") %>%
    mutate(hhi_share=hhi*share) %>%
    pull(hhi_share) %>%
    sum()

########################################################################
########################################################################
########################################################################
########################################################################
########################################################################


df_price_max <- df_price %>%
    filter(price_diff==max(price_diff, na.rm=T))


price_max <- df_price_max %>%
    pull(k)



test5 <- df_ini %>%
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t), k==price_max) %>%
    summarise(i_val=sum(v, na.rm=T),
              i_qty=sum(q, na.rm=T)) %>%
    ungroup %>%
    mutate(i_price=i_val/i_qty)



test6 <- df_ini %>%
    filter(i %in% codes_eu$country_code, j %!in% codes_eu$country_code,
           t==max(t), k==price_max) %>%
    summarise(e_val=sum(v, na.rm=T),
              e_qty=sum(q, na.rm=T)) %>%
    ungroup %>%
    mutate(e_price=e_val/e_qty)


test56 <- abs(test6$e_price-test5$i_price)/test5$i_price
 

######################################################################
######################################################################
######################################################################
######################################################################
######################################################################


df_production_max <- df_production %>%
    filter(production_capacity==max(production_capacity, na.rm=T))


max_prod <- df_production_max %>%
    pull(k)


test7 <- df_ini %>%
    filter(i %in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t), k==max_prod) %>%
    summarise( intra_eu_e_qty=sum(q, na.rm=T)) %>%
    ungroup 

test8 <- df_ini %>%
    filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
           t==max(t), k==max_prod) %>%
    summarise( i_qty=sum(q, na.rm=T)) %>%
    ungroup  
    




test9 <- df_ini %>%
    filter(i %in% codes_eu$country_code, j %!in% codes_eu$country_code,
           t==max(t), k==max_prod) %>%
    summarise( e_qty=sum(q, na.rm=T)) %>%
    ungroup 


test789 <- test8$i_qty/(test9$e_qty+test7$intra_eu_e_qty)





print("So far so good")