Rev. | a0fb89acee5ffef9bbb62447d795fa7e0bb73bb1 |
---|---|
Größe | 14,108 Bytes |
Zeit | 2021-01-27 22:35:23 |
Autor | Lorenzo Isella |
Log Message | I removed the dependency from the stat_lib.R . |
rm(list=ls())
## last saved on Time-stamp: "2021-01-27 14:32:37 lorenzo"
library(tidyverse)
library(janitor)
library(openxlsx)
library(janitor)
## source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
hhi_index <- function(x){
y <- x/sum(x)
res <- sum(y^2)
return(res)
}
na_to_pattern <- function(df, pattern){
res <- df %>% replace(., is.na(.), pattern)
return(res)
}
add_total <- function(x, pos=1, ...){
adorn_totals(x, ...) %>% as_tibble %>%
move_row(nrow(.), pos)
}
move_row <- function(df, ini_pos, fin_pos){
ll <- nrow(df)
row_pick <- slice(df, ini_pos)
if (fin_pos=="last"){
res <- df %>%
slice(-ini_pos) %>%
add_row(row_pick, .before = ll)
} else{
res <- df %>%
slice(-ini_pos) %>%
add_row(row_pick, .before = fin_pos)
}
return(res)
}
save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){
tryCatch({
wb <- loadWorkbook(fileName)
addWorksheet(wb = wb, sheet = sheetName)
writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F,
keepNA=na_yes ,...)
saveWorkbook(wb = wb, file = fileName, overwrite = T)
},
error = function(err){
wb <- createWorkbook(fileName)
addWorksheet(wb = wb, sheet = sheetName)
writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes , ... )
saveWorkbook(wb = wb, file = fileName, overwrite = T)
})
}
'%!in%' <- function(x,y)!('%in%'(x,y))
rem_dupl_cols <- function(df){
res <- df[, !duplicated(t(df))]
return(res)
}
rem_const_cols <- function(df){
## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0]
## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1]
res <- df %>%
select(where(~length(unique(na.omit(.x))) > 1))
return(res)
}
clean_data <- function(x){
res <- x %>%
clean_names() %>%
remove_empty() %>%
distinct() %>%
rem_dupl_cols() %>%
rem_const_cols()
return(res)
}
substrLeft <- function(x, n){
substr(x,1, n )
}
n_cat <- 20 ### number of values for the discretized ranking
n_top <- 500 ## number of products to keep
eu_list <- c("BE", "BG","CZ", "DK", "DE", "EE", "IE", "GR",
"ES", "FR", "IT", "CY", "LV", "LT", "LU", "HU", "MT",
"NL", "AT","PL", "PT", "RO", "SI", "SK", "FI","SE", "HR" ) %>%
as_tibble()
## df_ini <- read_csv("./input_files/BACI_HS17_Y2018_V202001.csv",col_types = cols(i="c", j="c")) %>%
## mutate(i=str_pad(i, 3,"left", "0"),
## j=str_pad(j,3, "left","0" ))
## saveRDS(df_ini, "baci_set.RDS")
df_ini <- readRDS("baci_set.RDS")
prod_codes <- read_csv("./input_files/product_codes_HS17_V202001.csv",
col_types=c(.default="c")) %>%
mutate(code=str_pad(code,6, "left", "0"))
codes <- read_csv("./input_files/country_codes_V202001.csv") %>%
clean_data() %>%
mutate(country_code=str_pad(country_code,3, "left", "0"))
codes_sel <- codes %>%
select(country_code, country_name_full)
codes_eu <- codes %>%
right_join(y=eu_list, by=c("iso_2digit_alpha"="value")) %>%
filter(complete.cases(.)) %>%
select(country_code, iso_2digit_alpha)
df_imports_raw <- df_ini %>%
select(-q) %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t)) %>%
group_by(j,k) %>% ## this means: for every product and EU country
summarise(N=n(), i_value=sum(v, na.rm=T)) %>% #count the number of
#foreign suppliers.
ungroup %>%
## by now I got rid of the supplier index i and I see only the statistics
## for the product k and the EU importing MS j
group_by(k) %>%
mutate( i_share=i_value/sum(i_value)) %>%
ungroup %>% ## Now by grouping by k I determine what share of imports of product k goes to each importing MS
mutate(N_eu=N*i_share)
df_i_share <- df_imports_raw %>%
select(j,k,i_share)
save_excel(df_i_share, "Extra_EU27_import_share.xlsx")
### Now I calculate index 1.1 of the report for the imports
df_i_N_eu <- df_imports_raw %>%
group_by(k) %>%
summarise(N_eu_fin=sum(N_eu)) %>%
ungroup %>% ## for every product I sum the number of suppliers for each MS weighted by the share of trade of each MS.
arrange(N_eu_fin) %>%
left_join(y=prod_codes, by=c("k"="code"))%>%
select(description, everything())
### Now I start the calculation of index 1.2 of the report for the
##
df_i_hhi <- df_ini %>%
select(-q) %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t)) %>% ##again, all the exports to EU MS from non-EU countries
group_by(j, k) %>% ## I group on products and importing MS
summarise(hhi=hhi_index(v)) %>% ## and I calculate the HHI on the value of the imports of each MS from non-EU countries
ungroup %>%
left_join(y=df_i_share, by=c("j", "k")) ##and I reuse the shares which I calculated before (which are nothing else but the share of imports of MS j of product k based on the total imports of k from abroad to the EU)
df_i_hhi_eu <- df_i_hhi %>%
mutate(hhi_eu=hhi*i_share) %>%
group_by(k) %>%
summarise(hhi_eu_fin=sum(hhi_eu)) %>% ## the value of the HHI for the EU is again the weighted sum of those for all the MS.
ungroup %>%
arrange(desc(hhi_eu_fin))%>%
left_join(y=prod_codes, by=c("k"="code"))%>%
select(-description)
### Now indicator 2.2
df_import_price <- df_ini %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t)) %>%
group_by(k) %>%
summarise(import_val=sum(v, na.rm=T),
import_qty=sum(q, na.rm=T)) %>%
ungroup %>%
mutate(import_price=import_val/import_qty)
df_export_price <- df_ini %>%
filter(i %in% codes_eu$country_code, j %!in% codes_eu$country_code,
t==max(t)) %>%
group_by(k) %>%
summarise(export_val=sum(v, na.rm=T),
export_qty=sum(q, na.rm=T)) %>%
ungroup %>%
mutate(export_price=export_val/export_qty)
df_import_export_price <- df_import_price %>%
full_join(df_export_price, by="k") %>%
mutate(price_diff=abs(export_price-import_price)/import_price)
df_price <- df_import_export_price %>%
select(k, price_diff)
########################################
### Now indicator 2.1
df_intra_eu_export_qty<- df_ini %>%
filter(i %in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t)) %>%
group_by(k) %>%
summarise(intra_eu_export_qty=sum(q, na.rm=T)) %>%
ungroup ## %>%
## mutate(export_price=export_val/export_qty)
df_import_export_qty <- df_import_export_price %>%
select(k, import_qty, export_qty)
df_production <- df_import_export_qty %>%
full_join(df_intra_eu_export_qty, by="k") %>%
mutate(production_capacity=import_qty/(export_qty+intra_eu_export_qty))
df_production_indicator <- df_production %>%
select(k, production_capacity)
### How to rank the indicators
### N_eu: from small (few foreign suppliers) to large (many foreign suppliers)
### hhi: from large (high concentration of the imports in the hands of a few foreign suppliers ) to low
### production_capacity: from high (large ratio imports to exports i.e. low capacity for production) to low
## price diff: from high to low (products with a large gap between import and export prices are not supposed to be easy to replace )
df_out <- df_i_N_eu %>%
full_join(df_i_hhi_eu, by="k") %>%
full_join(df_production_indicator, by="k") %>%
full_join(df_price, by="k") %>%
filter(complete.cases(.)) %>%
mutate(rank_N=dense_rank(N_eu_fin),
rank_hhi=dense_rank(desc(hhi_eu_fin)),
rank_production=dense_rank(desc(production_capacity)),
rank_price_diff=dense_rank(desc(price_diff))
) %>%
mutate(rank_N_discrete=ntile(N_eu_fin, n_cat),
rank_hhi_discrete=ntile(desc(hhi_eu_fin), n_cat),
rank_production_discrete=ntile(desc(production_capacity), n_cat),
rank_price_diff_discrete=ntile(desc(price_diff), n_cat)
) %>%
mutate(indicator_diversification=(rank_N+rank_hhi)/2.,
indicator_substitutability=(rank_production+rank_price_diff)/2.,
indicator_diversification_discrete=(rank_N_discrete+rank_hhi_discrete)/2.,
indicator_substitutability_discrete=(rank_production_discrete+
rank_price_diff_discrete)/2.
) %>%
mutate(final_indicator=(indicator_diversification+
indicator_substitutability)/2.,
final_indicator_discrete=(indicator_diversification_discrete+
indicator_substitutability_discrete)/2.
)
save_excel(df_out, "revised_calculations.xlsx")
## df_corr <- df_out %>%
## select(N_eu_fin:production_capacity) %>%
## cor()
df_stat <- df_out %>%
select(description, k, final_indicator, final_indicator_discrete)
df_top_discrete <- df_stat %>%
select(-final_indicator) %>%
arrange(final_indicator_discrete) %>%
slice(1:n_top) %>%
mutate(hs2=substrLeft(k,2),
hs4=substrLeft(k,4))
save_excel(df_top_discrete, "top_products_discrete_ranking.xlsx")
df_top <- df_stat %>%
select(-final_indicator_discrete) %>%
arrange(final_indicator) %>%
slice(1:n_top) %>%
mutate(hs2=substrLeft(k,2),
hs4=substrLeft(k,4))
save_excel(df_top, "top_products_ranking.xlsx")
hs2_top_discrete <- df_top_discrete %>%
pull(hs2) %>%
tabyl() %>%
arrange(desc(n)) %>%
mutate(cumulative=cumsum(percent))
hs2_top <- df_top %>%
pull(hs2) %>%
tabyl() %>%
arrange(desc(n)) %>%
mutate(cumulative=cumsum(percent))
#############################################################################
#############################################################################
#############################################################################
#############################################################################
#############################################################################
#############################################################################
### Now some tests on the calculated quantities
df_N_max <- df_i_N_eu %>%
filter(N_eu_fin==max(N_eu_fin))
k_N_max <- df_N_max %>%
pull(k)
test1 <- df_ini %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t), k==k_N_max) %>%
group_by(j) %>%
summarise(n=n(), n2=length(unique(i))) %>%
ungroup
test2 <- df_ini %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t), k==k_N_max) %>%
group_by(j) %>%
summarise(val=sum(v, na.rm=T)) %>%
ungroup %>%
mutate(share=val/sum(val))
test12 <- test1 %>%
full_join(y=test2, by="j") %>%
mutate(n_share=n*share) %>%
pull(n_share) %>%
sum()
##############################
df_hhi_min <- df_i_hhi_eu %>%
filter(hhi_eu_fin==min(hhi_eu_fin))
hhi_min <- df_hhi_min %>%
pull(k)
test3 <- df_ini %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t), k==hhi_min) %>%
group_by(j) %>%
summarise(hhi=hhi_index(v)) %>%
ungroup
test4 <- df_ini %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t), k==hhi_min) %>%
group_by(j) %>%
summarise(val=sum(v, na.rm=T)) %>%
ungroup %>%
mutate(share=val/sum(val))
test34 <- test3 %>%
full_join(test4, by="j") %>%
mutate(hhi_share=hhi*share) %>%
pull(hhi_share) %>%
sum()
########################################################################
########################################################################
########################################################################
########################################################################
########################################################################
df_price_max <- df_price %>%
filter(price_diff==max(price_diff, na.rm=T))
price_max <- df_price_max %>%
pull(k)
test5 <- df_ini %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t), k==price_max) %>%
summarise(i_val=sum(v, na.rm=T),
i_qty=sum(q, na.rm=T)) %>%
ungroup %>%
mutate(i_price=i_val/i_qty)
test6 <- df_ini %>%
filter(i %in% codes_eu$country_code, j %!in% codes_eu$country_code,
t==max(t), k==price_max) %>%
summarise(e_val=sum(v, na.rm=T),
e_qty=sum(q, na.rm=T)) %>%
ungroup %>%
mutate(e_price=e_val/e_qty)
test56 <- abs(test6$e_price-test5$i_price)/test5$i_price
######################################################################
######################################################################
######################################################################
######################################################################
######################################################################
df_production_max <- df_production %>%
filter(production_capacity==max(production_capacity, na.rm=T))
max_prod <- df_production_max %>%
pull(k)
test7 <- df_ini %>%
filter(i %in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t), k==max_prod) %>%
summarise( intra_eu_e_qty=sum(q, na.rm=T)) %>%
ungroup
test8 <- df_ini %>%
filter(i %!in% codes_eu$country_code, j %in% codes_eu$country_code,
t==max(t), k==max_prod) %>%
summarise( i_qty=sum(q, na.rm=T)) %>%
ungroup
test9 <- df_ini %>%
filter(i %in% codes_eu$country_code, j %!in% codes_eu$country_code,
t==max(t), k==max_prod) %>%
summarise( e_qty=sum(q, na.rm=T)) %>%
ungroup
test789 <- test8$i_qty/(test9$e_qty+test7$intra_eu_e_qty)
print("So far so good")