Commit a59d656 (1 parent: 3bf122b)
Showing 5 changed files with 269 additions and 0 deletions.
@@ -0,0 +1,55 @@

# Python Interface to COVID-19 Data Hub

Python package [covid19poland](https://pypi.org/project/covid19poland/) provides access to COVID-19 data of Poland.

The data is scraped from Wikipedia.

## Setup and usage

Install from [pip](https://pypi.org/project/covid19poland/) with

```bash
pip install covid19poland
```

Import the main `fetch()` function with

```python
import covid19poland as PL

x = PL.fetch()
```
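
`fetch()` returns a pandas DataFrame (the scraper builds it with `pandas.read_html`), so the usual DataFrame tooling applies. A minimal sketch of inspecting the result, assuming the column names assigned by the country-level parser further below:

```python
# columns as assigned in parse_poland(): date, suspected, quarantined, monitored,
# tested, confirmed_daily, confirmed, active, recovered, deaths_official, deaths_unofficial
print(x.columns.tolist())
print(x.tail())   # most recent days last
```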

The package is updated regularly. Upgrade it with

```bash
pip install --upgrade covid19poland
```

## Parametrization

### Level

Level sets the granularity of the data:

1. Country level (default)
2. State level

```python
import covid19poland as PL

# country level
x1 = PL.fetch(level = 1)
# state level
x2 = PL.fetch(level = 2)
```
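
The country-level frame has one row per day, while the state-level frame comes in long format with one row per day and voivodeship code (`date`, `state`, `confirmed`, `deaths`), as produced by the state-level parser below. A minimal sketch of slicing one state:

```python
# e.g. MZ = Masovian Voivodeship; codes follow the `states` list in the scraper
mz = x2[x2["state"] == "MZ"]
print(mz[["date", "confirmed", "deaths"]].tail())
```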

## Contribution

Developed by [Martin Benes](https://github.com/martinbenes1996).

Join on [GitHub](https://github.com/martinbenes1996/covid19poland).

@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
"""Webscraper for Wikipedia. Falls back to archive.org if scraping fails.
Reference: https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Poland
Todo:
    * caching
"""

import pkg_resources
from .main import *

try:
    # distribution name as declared in setup.py
    __version__ = pkg_resources.get_distribution("covid19poland").version
except Exception:
    __version__ = None
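
The try/except above exposes the installed distribution's version at package level; a minimal usage sketch, assuming the package was installed from pip:

```python
import covid19poland as PL

print(PL.__version__)   # e.g. "0.0.1" when installed; None when run from a bare checkout
```
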
@@ -0,0 +1,155 @@
from datetime import datetime, date, timedelta
import json
import re
import time
import warnings

from bs4 import BeautifulSoup
import pandas as pd
import requests

def create_url(dt = None):
    if dt is not None:
        # date, datetime OK
        if isinstance(dt, date) or isinstance(dt, datetime): pass
        # parse string date
        elif isinstance(dt, str):
            try: dt = datetime.strptime(dt, "%Y-%m-%d")
            except ValueError:
                try: dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
                except ValueError:
                    try: dt = datetime.strptime(dt, "%Y-%m-%dT%H:%M:%SZ")
                    except ValueError: raise ValueError("unknown format of date")
        # unsupported date type
        else: raise TypeError("unknown type of date")
    # create Wayback Machine availability url
    urlparam = 'url=https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Poland'
    dtparam = f"&timestamp={ dt.strftime('%Y%m%d') }" if dt is not None else ""
    return f'http://archive.org/wayback/available?{urlparam}{dtparam}'

def fetch_www_json(url, dt = None):
    # without a date, use the live Wikipedia page
    if dt is None:
        return 'https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Poland', datetime.now()
    # receive www json
    body = json.loads(requests.get(url).text)
    # parse closest archive.org snapshot
    source_url = body['archived_snapshots']['closest']['url']
    source_dt = body['archived_snapshots']['closest']['timestamp']
    return source_url, datetime.strptime(source_dt, "%Y%m%d%H%M%S")

def get_previous_dt(dt = None):
    if dt is None:
        dt = datetime.now()
    # recursion condition
    if dt < datetime(2020, 4, 30):
        raise Exception("no earlier page found")
    # get closest snapshot date
    url = create_url(dt)
    url_w, dt_w = fetch_www_json(url, dt)
    # snapshot is older than the requested date
    if dt > dt_w:
        return dt_w
    else:
        return get_previous_dt(dt - timedelta(days=1))

def parse_poland(x):
    # get table
    t = pd.read_html(x[0].prettify())[0]
    # format table
    t.columns = ["date","suspected","quarantined","monitored","tested","confirmed_daily","confirmed","active","recovered","deaths_official","deaths_unofficial","source"]
    t = t[:-3].fillna(0).drop("source", axis=1)
    t["date"] = t["date"].apply(lambda s: datetime.strptime(re.search("^[0-9]+ [a-zA-Z]+ [0-9]+", s).group(0), '%d %B %Y'))
    for col in ["suspected","quarantined","monitored","tested","confirmed_daily","confirmed","active","recovered","deaths_official","deaths_unofficial"]:
        t[col] = t[col].apply(lambda s: int(re.search("^-?[0-9]+", str(s)).group(0)))

    return t

states = ["DS","KP","LB","LD","LU","MA","MZ","OP","PD","PK","PM","SK","SL","WN","WP","ZP"]
def parse_states(x):
    # get tables
    t_confirmed = pd.read_html(x[1].prettify())[0]
    t_deaths = pd.read_html(x[2].prettify())[0]
    # format tables
    t_deaths.columns = t_confirmed.columns = ["date", *states, "daily", "total", "source"]
    t_confirmed = t_confirmed[:-3].fillna(0).drop(["daily","total","source"], axis=1)
    t_deaths = t_deaths[:-3].fillna(0).drop(["daily","total","source"], axis=1)
    # parse dates
    parse_date = lambda dt: datetime.strptime(re.search("^[0-9]+ [a-zA-Z]+ [0-9]+", dt).group(0), '%d %B %Y')
    t_confirmed["date"] = t_confirmed["date"].apply(parse_date)
    t_deaths["date"] = t_deaths["date"].apply(parse_date)
    # parse numbers
    parse_int_prefix = lambda i: int(re.search("^-?[0-9]+", str(i)).group(0))
    for col in states:
        t_confirmed[col] = t_confirmed[col].apply(parse_int_prefix)
        t_deaths[col] = t_deaths[col].apply(parse_int_prefix)
    # wide to long
    confirmed = t_confirmed.melt(id_vars='date', value_vars=states, var_name="state", value_name="confirmed")
    deaths = t_deaths.melt(id_vars='date', value_vars=states, var_name="state", value_name="deaths")
    # join
    xx = confirmed.merge(deaths, on=["date","state"], how="outer")
    xx["confirmed"] = xx["confirmed"].fillna(0).apply(parse_int_prefix)
    xx["deaths"] = xx["deaths"].fillna(0).apply(parse_int_prefix)

    return xx

def fetch_table(table_parser = lambda x: x, dt = None):
    # create archive url
    archive_url = create_url(dt)
    # fetch json
    try:
        url, dt = fetch_www_json(archive_url, dt)
    except Exception:
        print("error accessing archive")
        raise
    # fetch wiki
    time.sleep(1)
    try:
        response = requests.get(url)
    except Exception:
        print("error accessing page")
        raise
    # parse
    try:
        wiki = BeautifulSoup(response.text, features="lxml")
        tables = wiki.find_all("table", class_="wikitable")
        t = table_parser(tables)
    # error
    except Exception:
        pass
    # ok
    else:
        return t

    # on error, fetch another page from the archive
    try:
        dt_prev = get_previous_dt(dt)
    except Exception:
        print("error fetching previous date")
        raise

    try:
        t = fetch_table(table_parser, dt_prev)
    except Exception:
        print("error fetching table")
        raise
    return t

def fetch1(dt = None):
    return fetch_table(parse_poland, dt)
def fetch2(dt = None):
    return fetch_table(parse_states, dt)

def fetch(level = 1, dt = None):
    if level == 1:
        return fetch_table(parse_poland, dt)
    elif level == 2:
        return fetch_table(parse_states, dt)
    else:
        warnings.warn("unsupported level")
        return None

__all__ = ["fetch"]

if __name__ == "__main__":
    raise NotImplementedError
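
To illustrate the archive fallback end to end, a minimal usage sketch (assuming network access and that the package is imported as in the README); the date string is parsed by `create_url()` and resolved to the closest archive.org snapshot:

```python
import covid19poland as PL

# country-level table as of (roughly) mid-May 2020, served from an archive.org snapshot
x_may = PL.fetch(level = 1, dt = "2020-05-15")
print(x_may.tail())
```
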
@@ -0,0 +1,7 @@
# remove previous releases (egg-info directory follows the package name in setup.py)
rm -rf build/ dist/ covid19poland.egg-info/ __pycache__/
# compile
python setup.py sdist bdist_wheel
# publish
python -m twine upload dist/*
@@ -0,0 +1,37 @@
import setuptools
with open("README.md", "r", encoding="UTF-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name = 'covid19poland',
    version = '0.0.1',
    author = 'Martin Beneš',
    author_email = '[email protected]',
    description = 'Web Scraper for Poland COVID19 data.',
    long_description = long_description,
    long_description_content_type="text/markdown",
    packages=setuptools.find_packages(),
    license='GPL',
    url = 'https://www.covid19datahub.io/',
    download_url = 'https://github.com/martinbenes1996/covid19poland/archive/0.0.1.tar.gz',
    keywords = ['2019-nCov', 'poland', 'coronavirus', 'covid-19', 'covid-data', 'covid19-data'],
    # runtime dependencies used by the scraper module
    install_requires=['beautifulsoup4', 'pandas', 'requests', 'lxml'],
    package_dir={'': '.'},
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Intended Audience :: Other Audience',
        'Topic :: Database',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Software Development :: Libraries',
        'Topic :: Utilities',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
)