Commit

initial

martinbenes1996 committed May 21, 2020
1 parent 3bf122b commit a59d656
Showing 5 changed files with 269 additions and 0 deletions.
55 changes: 55 additions & 0 deletions README.md
@@ -0,0 +1,55 @@

# Python Interface to COVID-19 Data of Poland

The Python package [covid19poland](https://pypi.org/project/covid19poland/) provides access to COVID-19 data for Poland.

The data is scraped from Wikipedia.

## Setup and usage

Install from [pip](https://pypi.org/project/covid19poland/) with

```bash
pip install covid19poland
```

Import the main `fetch()` function with

```python
import covid19poland as PL

x = PL.fetch()
```
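
The return value is a *pandas* DataFrame. For the default country level it holds one row per day; the column names below are indicative, taken from the parser in `covid19poland/main.py`:

```python
import covid19poland as PL

x = PL.fetch()
# columns as named in the parser:
# date, suspected, quarantined, monitored, tested, confirmed_daily,
# confirmed, active, recovered, deaths_official, deaths_unofficial
print(x.head())
```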

The package is regularly updated. Upgrade with

```bash
pip install --upgrade covid19poland
```

## Parametrization

### Level

The `level` parameter sets the granularity of the data:

1. Country level (default)
2. State level

```python
import covid19poland as PL

# country level
x1 = PL.fetch(level = 1)
# state level
x2 = PL.fetch(level = 2)
```
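
### Date

`fetch()` also accepts a `dt` argument (a `date`, a `datetime`, or a `"%Y-%m-%d"` string) to retrieve the page as archived around that time on archive.org. A minimal sketch; the date is chosen purely for illustration:

```python
import covid19poland as PL

# state-level data as archived around 10 May 2020
x = PL.fetch(level = 2, dt = "2020-05-10")
```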

## Contribution

Developed by [Martin Benes](https://github.com/martinbenes1996).

Join the project on [GitHub](https://github.com/martinbenes1996/covid19poland).



15 changes: 15 additions & 0 deletions covid19poland/__init__.py
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
"""Webscraper for Wikipedia. Uses archive.org if fails scraping.
Reference: https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Poland
Todo:
* caching
"""

import pkg_resources
from .main import *

try:
__version__ = pkg_resources.get_distribution("covid19_PL_wiki").version
except:
__version__ = None
155 changes: 155 additions & 0 deletions covid19poland/main.py
@@ -0,0 +1,155 @@


from datetime import datetime,date,timedelta
import json
import re
import time
import warnings

from bs4 import BeautifulSoup
import pandas as pd
import requests

def create_url(dt = None):
    if dt is not None:
        # date, datetime OK
        if isinstance(dt, (date, datetime)): pass
        # parse string date
        elif isinstance(dt, str):
            try: dt = datetime.strptime(dt, "%Y-%m-%d")
            except ValueError:
                try: dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
                except ValueError:
                    try: dt = datetime.strptime(dt, "%Y-%m-%dT%H:%M:%SZ")
                    except ValueError: raise ValueError("unknown format of date")
        # unsupported date type
        else: raise TypeError("unknown type of date")
    # create url
    urlparam = 'url=https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Poland'
    dtparam = f"&timestamp={ dt.strftime('%Y%m%d') }" if dt is not None else ""
    return f'http://archive.org/wayback/available?{urlparam}{dtparam}'
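
# For illustration (value assumed): create_url(date(2020, 5, 1)) yields
#   'http://archive.org/wayback/available?url=https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Poland&timestamp=20200501'
# while create_url() omits the timestamp parameter.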

def fetch_www_json(url, dt = None):
    if dt is None:
        return 'https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Poland', datetime.now()
    # receive www json
    body = json.loads(requests.get(url).text)
    # parse closest snapshot
    source_url = body['archived_snapshots']['closest']['url']
    source_dt = body['archived_snapshots']['closest']['timestamp']
    return source_url, datetime.strptime(source_dt, "%Y%m%d%H%M%S")
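
# The availability API answers with JSON of the (indicative) shape
#   {"archived_snapshots": {"closest": {"url": "...", "timestamp": "20200520...",
#    "available": true, "status": "200"}}};
# only 'url' and 'timestamp' are read above.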

def get_previous_dt(dt = None):
    if dt is None:
        dt = datetime.now()
    # recursion condition
    if dt < datetime(2020, 4, 30):
        raise Exception("no earlier page found")
    # get closest archived date (pass dt on, otherwise the live page is returned)
    url = create_url(dt)
    url_w, dt_w = fetch_www_json(url, dt)
    # changed
    if dt > dt_w:
        return dt_w
    else:
        return get_previous_dt(dt - timedelta(days=1))

def parse_poland(x):
    # get table
    t = pd.read_html(x[0].prettify())[0]
    # format table
    t.columns = ["date","suspected","quarantined","monitored","tested","confirmed_daily",
                 "confirmed","active","recovered","deaths_official","deaths_unofficial","source"]
    t = t[:-3].fillna(0).drop("source", axis=1)
    # parse dates such as "4 March 2020"
    t["date"] = t["date"].apply(lambda s: datetime.strptime(re.search("^[0-9]+ [a-zA-Z]+ [0-9]+", s).group(0), '%d %B %Y'))
    # keep only the leading integer of each cell (cells may carry footnote marks)
    for col in ["suspected","quarantined","monitored","tested","confirmed_daily",
                "confirmed","active","recovered","deaths_official","deaths_unofficial"]:
        t[col] = t[col].apply(lambda s: int(re.search("^-?[0-9]+", str(s)).group(0)))

    return t

states = ["DS","KP","LB","LD","LU","MA","MZ","OP","PD","PK","PM","SK","SL","WN","WP","ZP"]
def parse_states(x):
    # get tables
    t_confirmed = pd.read_html(x[1].prettify())[0]
    t_deaths = pd.read_html(x[2].prettify())[0]
    # format tables
    t_deaths.columns = t_confirmed.columns = ["date", *states, "daily", "total", "source"]
    t_confirmed = t_confirmed[:-3].fillna(0).drop(["daily","total","source"], axis=1)
    t_deaths = t_deaths[:-3].fillna(0).drop(["daily","total","source"], axis=1)
    # parse dates
    parse_date = lambda dt: datetime.strptime(re.search("^[0-9]+ [a-zA-Z]+ [0-9]+", dt).group(0), '%d %B %Y')
    t_confirmed["date"] = t_confirmed["date"].apply(parse_date)
    t_deaths["date"] = t_deaths["date"].apply(parse_date)
    # parse numbers
    parse_int_prefix = lambda i: int(re.search("^-?[0-9]+", str(i)).group(0))
    for col in states:
        t_confirmed[col] = t_confirmed[col].apply(parse_int_prefix)
        t_deaths[col] = t_deaths[col].apply(parse_int_prefix)
    # wide to long
    confirmed = t_confirmed.melt(id_vars='date', value_vars=states, var_name="state", value_name="confirmed")
    deaths = t_deaths.melt(id_vars='date', value_vars=states, var_name="state", value_name="deaths")
    # join
    xx = confirmed.merge(deaths, on=["date","state"], how="outer")
    xx["confirmed"] = xx["confirmed"].fillna(0).apply(parse_int_prefix)
    xx["deaths"] = xx["deaths"].fillna(0).apply(parse_int_prefix)

    return xx

def fetch_table(table_parser = lambda x: None, dt = None):
    # create archive url
    archive_url = create_url(dt)
    # fetch json (pass dt on, otherwise the live page is used)
    try:
        url, dt = fetch_www_json(archive_url, dt)
    except Exception:
        print("error accessing archive")
        raise
    # fetch wiki
    time.sleep(1)
    try:
        response = requests.get(url)
    except Exception:
        print("error accessing page")
        raise
    # parse
    try:
        wiki = BeautifulSoup(response.text, features="lxml")
        tables = wiki.find_all("table", class_="wikitable")
        t = table_parser(tables)
    # error
    except Exception:
        pass
    # ok
    else:
        return t

    # on error, fetch another page from the archive
    try:
        dt_prev = get_previous_dt(dt)
    except Exception:
        print("error fetching previous date")
        raise

    try:
        t = fetch_table(table_parser, dt_prev)
    except Exception:
        print("error fetching table")
        raise
    return t

def fetch1(dt = None):
    return fetch_table(parse_poland, dt)

def fetch2(dt = None):
    return fetch_table(parse_states, dt)

def fetch(level = 1, dt = None):
    if level == 1:
        return fetch_table(parse_poland, dt)
    elif level == 2:
        return fetch_table(parse_states, dt)
    else:
        warnings.warn("unsupported level")
        return None

__all__ = ["fetch"]

if __name__ == "__main__":
    raise NotImplementedError
7 changes: 7 additions & 0 deletions publish.sh
@@ -0,0 +1,7 @@

# remove previous releases
rm -rf build/ dist/ covid19poland.egg-info/ __pycache__/
# compile
python setup.py sdist bdist_wheel
# publish
python -m twine upload dist/*
37 changes: 37 additions & 0 deletions setup.py
@@ -0,0 +1,37 @@

import setuptools

with open("README.md", "r", encoding="UTF-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name = 'covid19poland',
    version = '0.0.1',
    author = 'Martin Beneš',
    author_email = '[email protected]',
    description = 'Web scraper for Poland COVID-19 data.',
    long_description = long_description,
    long_description_content_type = "text/markdown",
    packages = setuptools.find_packages(),
    license = 'GPL',
    url = 'https://www.covid19datahub.io/',
    download_url = 'https://github.com/martinbenes1996/covid19poland/archive/0.0.1.tar.gz',
    keywords = ['2019-nCov', 'poland', 'coronavirus', 'covid-19', 'covid-data', 'covid19-data'],
    # runtime dependencies imported in covid19poland/main.py
    install_requires = ['beautifulsoup4', 'lxml', 'pandas', 'requests'],
    # main.py uses f-strings, which require Python 3.6+
    python_requires = '>=3.6',
    package_dir = {'': '.'},
    classifiers = [
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Intended Audience :: Other Audience',
        'Topic :: Database',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Software Development :: Libraries',
        'Topic :: Utilities',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
    ],
)
