95 lines
3.2 KiB
Python
95 lines
3.2 KiB
Python
import re
|
|
import requests
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from fake_useragent import UserAgent
|
|
|
|
class EurostatData(object):
|
|
|
|
"""
|
|
for more information: https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=BulkDownload_Guidelines.pdf
|
|
"""
|
|
|
|
def __init__(self, language:str="en"):
|
|
self.language = language
|
|
self.url = "https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/"
|
|
self.toc_url = "https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents_{}.txt".format(language)
|
|
|
|
def toc(self) -> pd.DataFrame:
|
|
"""
|
|
the return value includes 8 columns:
|
|
'title'
|
|
'code'
|
|
'type'
|
|
'last update of data'
|
|
'last table structure change'
|
|
'data start'
|
|
'data end'
|
|
'values'
|
|
"""
|
|
toc = pd.read_csv(self.toc_url, sep="\t")
|
|
return toc
|
|
|
|
def search_toc(self, query:str=None):
|
|
"""
|
|
fuzzy search in the "title"
|
|
"""
|
|
toc = self.toc()
|
|
if query == None:
|
|
return ValueError("rex is invalid.")
|
|
else:
|
|
return toc[toc["title"].str.contains(query)].reset_index(drop=True)
|
|
|
|
def download_data(self, datasetcode:str=None, geo:str=None, unit:str=None):
|
|
url = self.url + "BulkDownloadListing?sort=1&file=data%2F" + datasetcode + ".tsv.gz"
|
|
data = pd.read_csv(url, sep = "\t", compression="gzip")
|
|
data = data.drop(data.columns[0], axis=1).join(data[data.columns[0]].str.split(",", expand=True))
|
|
columns_list = list(data.columns)[:-3] + ["unit", "na_item", "geo"]
|
|
data.columns = columns_list
|
|
columns_list = columns_list[-3:] + columns_list[:-3]
|
|
data = data[columns_list]
|
|
if geo != None and unit != None:
|
|
data = data.loc[(data["geo"] == geo) & (data["unit"] == unit)]
|
|
for i in range(4, len(list(data.columns))):
|
|
data[data.columns[i]] = data[data.columns[i]].str.extract(r'(\d+.\d+)').astype("float")
|
|
return data
|
|
|
|
elif geo != None and unit == None:
|
|
data = data.loc[(data["geo"] == geo)]
|
|
for i in range(4, len(list(data.columns))):
|
|
data[data.columns[i]] = data[data.columns[i]].str.extract(r'(\d+.\d+)').astype("float")
|
|
return data
|
|
|
|
elif geo == None and unit != None:
|
|
data = data.loc[(data["geo"] == geo)]
|
|
for i in range(4, len(list(data.columns))):
|
|
data[data.columns[i]] = data[data.columns[i]].str.extract(r'(\d+.\d+)').astype("float")
|
|
return data
|
|
|
|
elif geo == None and unit == None:
|
|
for i in range(4, len(list(data.columns))):
|
|
data[data.columns[i]] = data[data.columns[i]].str.extract(r'(\d+.\d+)').astype("float")
|
|
return data
|
|
|
|
def download_dic(self, category:str=None):
|
|
url = self.url + "BulkDownloadListing?sort=1&file=dic%2F{}".format(self.language) + "%2F" + category + "dic"
|
|
return pd.read_csv(url, sep="\t")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
eu = EurostatData(language="en", version=2.1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|