# Provenance: extracted from patch 620477ab3f41008a4968f28d26bf818871a870c9
# ("add NBSCDdata", TerenceLiu, Fri, 28 Jan 2022 22:52:23 +0800),
# which creates CEDA/economic/NBSCData.py.
"""
Client for the NBSC "easyquery" web API.

NBSC's api information:

url: "https://data.stats.gov.cn/english/easyquery.htm"
params: id=zb&dbcode=hgnd&wdcode=zb&m=getTree
"""

import json
import os
import time
import pickle
import random
from typing import Optional

import requests
import pandas as pd
from tqdm import tqdm
from fake_useragent import UserAgent
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# SECURITY NOTE: every request in this module is sent with ``verify=False``
# (TLS certificate verification disabled). The call below only silences the
# warning urllib3 would otherwise emit for each such request.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


class NBSCData(object):
    """Download the indicator tree and indicator series from the NBSC API.

    Typical use::

        nbsc = NBSCData(language="en")
        nodes = nbsc.tree_generation()      # nested list of dicts
        toc = nbsc.toc(nodes)               # flat DataFrame of leaf nodes
        data = nbsc.download_data(nid="A0203")
    """

    # language code -> (endpoint URL, file name of the tree pickle)
    _ENDPOINTS = {
        "cn": ("https://data.stats.gov.cn/easyquery.htm", "data.pkl"),
        "en": ("https://data.stats.gov.cn/english/easyquery.htm", "data_en.pkl"),
    }

    # Throttling: pause SHORT_PAUSE seconds on every 100th node index of a
    # level, and (slow mode only) LONG_PAUSE seconds on every 1000th.
    _SHORT_PAUSE_EVERY = 100
    _SHORT_PAUSE = 1
    _LONG_PAUSE_EVERY = 1000
    _LONG_PAUSE = 10

    # Class-level defaults so the methods also work on instances whose
    # ``__init__`` was bypassed.
    timeout = 30.0
    _user_agent = None

    def __init__(self, language: str = "en", timeout: float = 30.0):
        """Select the API endpoint for ``language``.

        Args:
            language: ``"en"`` for the English endpoint, ``"cn"`` for the
                Chinese one.
            timeout: seconds to wait for each HTTP request before giving up.

        Raises:
            ValueError: if ``language`` is not ``"en"`` or ``"cn"``.
        """
        if language not in self._ENDPOINTS:
            # Previously this fell through silently and left ``self.url``
            # undefined, which only failed later with AttributeError.
            raise ValueError(
                "language must be one of {}, got {!r}".format(
                    sorted(self._ENDPOINTS), language
                )
            )

        # Columns of the table of contents; filled by ``toc``.
        self.dbcode = []
        self.nid = []
        self.pid = []
        self.name = []
        self.wdcode = []

        self.timeout = timeout
        self.url, tree_file = self._ENDPOINTS[language]
        self.BASE_DIR = os.path.dirname(__file__)
        self.__TREE_PATH__ = os.path.join(self.BASE_DIR, "NBSCTree", tree_file)

    def generate_header(self) -> dict:
        """Return request headers carrying a Chrome ``User-Agent`` string.

        The ``UserAgent`` object is created on first use and then reused;
        ``ua.chrome`` is still evaluated on every call.
        """
        if self._user_agent is None:
            self._user_agent = UserAgent()
        return {"User-Agent": str(self._user_agent.chrome)}

    @staticmethod
    def _pause(seconds):
        """Announce and perform a throttling pause of ``seconds`` seconds."""
        # The message is derived from the real delay so the two cannot drift
        # apart (the log used to say "2 seconds" while sleeping for 1).
        print(
            "[-] Due to the web scraping policy, sleep for {} second(s)".format(seconds)
        )
        time.sleep(seconds)

    def tree_generation(self, rid="zb", mode: str = "fast"):
        """Recursively download the indicator tree below node ``rid``.

        inspired by a blog: https://www.cnblogs.com/wang_yb/p/14636575.html

        Args:
            rid: id of the node whose children are requested (``"zb"`` is the
                value used for the root).
            mode: ``"fast"`` pauses 1 second on every 100th node index of a
                level; ``"slow"`` additionally pauses 10 seconds on every
                1000th. Index 0 matches both conditions, so every non-empty
                level pauses at least once. The mode applies to all levels of
                the recursion.

        Returns:
            The decoded JSON list of node dicts for ``rid``. Every node whose
            ``"isParent"`` is true gets a ``"children"`` key holding the same
            structure for its own children.

        Raises:
            requests.RequestException: on timeout, connection failure, a
                non-success HTTP status, or an undecodable JSON body.
        """
        r = requests.post(
            "{}?id={}&dbcode=hgnd&wdcode=zb&m=getTree".format(self.url, rid),
            headers=self.generate_header(),
            verify=False,
            timeout=self.timeout,
        )
        # Fail with the HTTP status instead of a JSON decoding error on an
        # error page.
        r.raise_for_status()
        data = r.json()

        for i, node in enumerate(data):
            print("[+] Downloading {} ...".format(node["name"]))
            if node["isParent"]:
                # Forward ``mode``: it used to be dropped here, so "slow"
                # only ever affected the root level.
                node["children"] = self.tree_generation(rid=node["id"], mode=mode)
            if i % self._SHORT_PAUSE_EVERY == 0:
                self._pause(self._SHORT_PAUSE)
            if mode == "slow" and i % self._LONG_PAUSE_EVERY == 0:
                self._pause(self._LONG_PAUSE)

        return data

    def _collect_leaves(self, nodes):
        """Append every leaf below ``nodes`` to the instance column lists."""
        for node in nodes:
            if node["isParent"]:
                self._collect_leaves(node["children"])
            else:
                self.dbcode.append(node["dbcode"])
                self.nid.append(node["id"])
                self.name.append(node["name"])
                self.pid.append(node["pid"])
                self.wdcode.append(node["wdcode"])

    def toc(self, nodes) -> pd.DataFrame:
        """Flatten a tree from ``tree_generation`` into a table of contents.

        inspired by a blog: https://www.cnblogs.com/wang_yb/p/14636575.html

        Args:
            nodes: list of node dicts; parents must carry ``"children"``,
                leaves must carry ``"dbcode"``, ``"id"``, ``"name"``,
                ``"pid"`` and ``"wdcode"``.

        Returns:
            DataFrame with one row per leaf and the columns ``dbcode``,
            ``nid``, ``name``, ``pid``, ``wdcode``. The same columns are
            left on the instance as ``self.dbcode``, ``self.nid``, ...
        """
        # Start from empty columns so that calling ``toc`` again does not
        # append to (and thereby duplicate) the rows of an earlier call.
        self.dbcode, self.nid, self.pid = [], [], []
        self.name, self.wdcode = [], []

        self._collect_leaves(nodes)

        # Built once here rather than at every level of the recursion.
        return pd.DataFrame(
            {
                "dbcode": self.dbcode,
                "nid": self.nid,
                "name": self.name,
                "pid": self.pid,
                "wdcode": self.wdcode,
            }
        )

    def download_data(self, nid: Optional[str] = None, sj="1978-"):
        """Download the series of indicator ``nid``.

        Args:
            nid: indicator id (the ``nid`` column of ``toc``), e.g.
                ``"A0203"``. Required despite the ``None`` default.
            sj: value sent for the ``"sj"`` dimension of the query;
                presumably a time range such as ``"1978-"`` (TODO confirm
                against the API).

        Returns:
            DataFrame with the columns ``date`` and ``value``, or ``None``
            when the server answers with a non-success HTTP status.

        Raises:
            TypeError: if ``nid`` or ``sj`` is not a string.
            requests.RequestException: on timeout or connection failure.
        """
        if not isinstance(nid, str) or not isinstance(sj, str):
            raise TypeError(
                "nid and sj must be strings (e.g. nid='A0203'), got "
                "nid={!r}, sj={!r}".format(nid, sj)
            )

        # json.dumps escapes quotes/backslashes in the values; the compact
        # separators keep the text identical to the former hand-built string
        # for ordinary ids.
        dfwds = json.dumps(
            [
                {"wdcode": "zb", "valuecode": nid},
                {"wdcode": "sj", "valuecode": sj},
            ],
            separators=(",", ":"),
            ensure_ascii=False,
        )
        params = {
            "dbcode": "hgnd",
            "rowcode": "zb",
            "m": "QueryData",
            "colcode": "sj",
            "wds": "[]",
            "dfwds": dfwds,
        }
        r = requests.get(
            self.url,
            params=params,
            verify=False,
            headers=self.generate_header(),
            timeout=self.timeout,
        )
        if not r.ok:
            # Explicit failure path; report the status so a None result can
            # be diagnosed.
            print(
                "[!] Request for {} failed with HTTP status {}".format(
                    nid, r.status_code
                )
            )
            return None

        datanodes = r.json()["returndata"]["datanodes"]
        return pd.DataFrame(
            {
                # wds[1] is read as the date entry of each data node
                "date": [entry["wds"][1]["valuecode"] for entry in datanodes],
                "value": [entry["data"]["data"] for entry in datanodes],
            }
        )


def main():
    """Demo: download the tree, list GDP indicators, fetch one series."""
    nbsc = NBSCData(language="en")
    nodes = nbsc.tree_generation()
    toc = nbsc.toc(nodes=nodes)
    # These results used to be computed and discarded; show them.
    print(toc[toc["name"].str.contains("GDP")])
    data = nbsc.download_data(nid="A0203")
    print(data)


if __name__ == "__main__":
    main()