CEDApy/CEDA/economic/EPU.py

86 lines
3.6 KiB
Python
Raw Normal View History

2022-01-27 04:00:23 +00:00
import logging
from urllib.parse import urljoin

import pandas as pd
import requests
from lxml import html
# Endpoints on policyuncertainty.com, keyed by data source.
url = {
    # Sources published as a single spreadsheet at a fixed address.
    "EPU-China": "https://www.policyuncertainty.com/media/SCMP_China_Policy_Uncertainty_Data.xlsx",
    "EPU-HKSAR": "https://www.policyuncertainty.com/media/HK_EPU_Data_Annotated.xlsx",
    # Site root; EPUData.download appends "<country>_monthly.html" to it.
    "EPU": "https://www.policyuncertainty.com/",
}
2022-01-29 01:40:37 +00:00
def country_list():
    """Return the selectable country names together with a usage note.

    Returns:
        A ``(names, note)`` tuple: ``names`` is a new list of country/region
        labels and ``note`` is a string clarifying what "Korea" refers to.
    """
    note = "Disambiguation: the word 'Korea' in here stands for 'South Korea'"
    names = [
        "Global", "USA", "Australia", "Belgium", "Brazil", "Canada",
        "Chile", "China", "Colombia", "Croatia", "Denmark", "France",
        "Germany", "Greece", "HKSAR", "MACAUSAR", "India", "Ireland",
        "Italy", "Japan", "Korea", "Mexico", "Netherlands", "Pakistan",
        "Russia", "Singapore", "Spain", "Sweden", "UK",
    ]
    return names, note
2022-01-27 04:00:23 +00:00
class EPUData(object):
    """Download Economic Policy Uncertainty (EPU) data for one country.

    Args:
        country: Country/region label, e.g. one of the names returned by
            ``country_list()``. It is stored as given on ``self.country``.
    """

    # Countries whose data is looked up under a dedicated key of the
    # module-level ``url`` mapping instead of the generic
    # "<country>_monthly.html" page. "MACAUSAR" has no dedicated entry in
    # ``url`` and therefore goes through the generic page like any other
    # country.
    _DIRECT_SOURCES = {"China": "EPU-China", "HKSAR": "EPU-HKSAR"}

    _log = logging.getLogger(__name__)

    def __init__(self, country: str = None):
        self.country = country

    @staticmethod
    def _classify_links(base, hrefs):
        """Split *hrefs* into absolute spreadsheet links and PDF links.

        Args:
            base: URL of the page the hrefs were found on; relative hrefs
                are resolved against it.
            hrefs: Iterable of raw ``href`` attribute strings.

        Returns:
            A ``(data_links, cite_links)`` tuple of lists. An href goes into
            ``data_links`` when it contains "xlsx" and into ``cite_links``
            when it contains "pdf"; input order is preserved.
        """
        hrefs = list(hrefs)
        # urljoin (rather than plain concatenation) keeps already-absolute
        # and root-relative hrefs valid.
        data_links = [urljoin(base, href) for href in hrefs if "xlsx" in href]
        cite_links = [urljoin(base, href) for href in hrefs if "pdf" in href]
        return data_links, cite_links

    def download(self, timeout: float = 30.0):
        """Fetch the EPU spreadsheets (and reference links) for the country.

        Args:
            timeout: Seconds to wait for the HTML page request made through
                ``requests``. It does not apply to the spreadsheet downloads
                performed by ``pandas.read_excel``.

        Returns:
            A dict with two keys: ``"data"``, a list with one
            ``pandas.read_excel`` result per spreadsheet, and
            ``"reference"``, a list of PDF URLs found on the country page
            (empty when the spreadsheet is read from a fixed address).

        Raises:
            ValueError: If ``self.country`` is not a non-empty string.
        """
        if not isinstance(self.country, str) or not self.country:
            raise ValueError(
                "country must be a non-empty string, got %r" % (self.country,)
            )

        source_key = self._DIRECT_SOURCES.get(self.country)
        if source_key is None:
            page_url = url["EPU"] + self.country.lower() + "_monthly.html"
        else:
            endpoint = url[source_key]
            if endpoint.lower().endswith((".xlsx", ".xls")):
                # The configured address is the workbook itself: read it
                # directly instead of trying to parse it as an HTML page.
                return {"data": [pd.read_excel(endpoint)], "reference": []}
            page_url = endpoint

        response = requests.get(page_url, timeout=timeout)
        if not response.ok:
            self._log.warning(
                "Request for %s returned HTTP %s; no data loaded",
                page_url, response.status_code,
            )
            return {"data": [], "reference": []}

        hrefs = html.fromstring(response.content).xpath("//a/@href")
        data_links, cite_links = self._classify_links(page_url, hrefs)

        frames = []
        for link in data_links:
            # Best effort: one unreadable file must not discard the others,
            # but the failure is reported instead of silently dropped.
            try:
                frames.append(pd.read_excel(link))
            except Exception:
                self._log.warning(
                    "Skipping %s: it could not be read as a spreadsheet",
                    link, exc_info=True,
                )
        return {"data": frames, "reference": cite_links}