# pubs_old.py.bak



import os
from sys import argv
from collections import Counter
from functools import partial
from multiprocessing import Pool
from itertools import repeat

import numpy as np
from pandas import Series, DataFrame
import pandas as pd
from bs4 import BeautifulSoup

from pubs_article import article

# Input directories. Each value ends with a slash because callers build
# file paths by plain string concatenation (e.g. geonamedir + filename).
datadir = 'data/'
articledir = '{}articles/'.format(datadir)
testdir = '{}testarticles/'.format(datadir)
geonamedir = '{}geonames/'.format(datadir)

# Further directories, relative to the working directory.
outdir = 'out/'
pubmeddir = 'pubmed/'

# File names of the GeoNames tables; they are resolved inside geonamedir.
cities1000 = 'cities1000.txt'
countryInfo = 'countryInfo.txt'


def build_test_paths():
    """Gather the inputs for a run over the test article set.

    Returns:
        A 3-tuple ``(articlepaths, places, countries)``: the article
        paths found under ``testdir``, the table loaded from the
        ``cities1000`` file and the table loaded from the
        ``countryInfo`` file, in that order.
    """
    return (
        get_article_paths(testdir),
        get_geoname_dataframe(cities1000),
        get_countryinfo_dataframe(countryInfo),
    )


def get_article_paths(articledir):
    """Collect the paths of all ``.nxml`` files below a directory.

    Args:
        articledir: Root directory that is searched recursively.

    Returns:
        A list of file paths expressed relative to the current working
        directory, in the order ``os.walk`` visits them. The list is
        empty when nothing matches or the directory does not exist.
    """
    paths = []
    for dirpath, _dirs, files in os.walk(articledir):
        # The relative directory is identical for every file inside it,
        # so resolve it once per directory instead of once per file.
        reldir = os.path.relpath(dirpath)
        paths.extend(
            os.path.join(reldir, filename)
            for filename in files
            if os.path.splitext(filename)[1] == '.nxml'
        )
    return paths


def get_geoname_dataframe(geonamefile):
    """Load a headerless, tab-separated GeoNames place table.

    Args:
        geonamefile: File name, resolved by appending it to the
            module-level ``geonamedir``.

    Returns:
        A DataFrame with one row per line of the file and the 19 named
        columns listed below, each parsed with the given dtype. Empty
        fields become NaN; every other string is kept verbatim.
    """
    # (column name, dtype) pairs in the left-to-right order of the file.
    columns = (
        ('geonameid', 'int64'),
        ('name', 'object'),
        ('asciiname', 'object'),
        ('alternatenames', 'object'),
        ('latitude', 'float64'),
        ('longitude', 'float64'),
        ('featureclass', 'object'),
        ('featurecode', 'object'),
        ('countrycode', 'object'),
        ('cc2', 'object'),
        ('admin1code', 'object'),
        ('admin2code', 'object'),
        ('admin3code', 'object'),
        ('admin4code', 'object'),
        ('population', 'int64'),
        ('elevation', 'float64'),
        ('dem', 'int64'),
        ('timezone', 'object'),
        ('modificationdate', 'object'),
    )
    places = pd.read_table(
        geonamedir + geonamefile,
        header=None,
        names=[name for name, _ in columns],
        dtype=dict(columns),
        encoding='utf-8',
        # pandas' default NA sentinels include the string "NA", which is
        # also the country code of Namibia. Treat only empty fields as
        # missing so such codes survive as text.
        keep_default_na=False,
        na_values=[''],
    )
    return places


def get_countryinfo_dataframe(countryfile):
    """Load a tab-separated country table whose first line is the header.

    Args:
        countryfile: File name, resolved by appending it to the
            module-level ``geonamedir``.

    Returns:
        A DataFrame with the columns named in the file's header row.
        Empty fields become NaN; every other string is kept verbatim.
    """
    countries = pd.read_table(
        geonamedir + countryfile,
        encoding='utf-8',
        # "NA" is a legitimate value in country data (ISO code of
        # Namibia, continent code of North America) but is one of pandas'
        # default NA sentinels. Only empty fields count as missing here.
        keep_default_na=False,
        na_values=[''],
    )
    return countries


def get_single_article_matches(path, places, countries, tag):
    """Run country and place matching on a single article file.

    Args:
        path: Path handed to the ``article`` constructor.
        places: Place table passed to ``article.match_places``.
        countries: Country table passed to ``article.match_countries``.
        tag: Tag name passed to ``article.get_tag_text``.

    Returns:
        Whatever ``article.give_dataframe()`` returns for this article
        (presumably a DataFrame of the matches; see ``pubs_article``).
    """
    parsed = article(path)
    parsed.get_tag_text(tag)
    # Countries are matched before places; keep that order.
    parsed.match_countries(countries)
    parsed.match_places(places)
    match_count = len(parsed.places)
    print(('Found %i matches.' % match_count))
    return parsed.give_dataframe()


def get_all_matches(articlepaths, places, countries, tag):
    """Match every article sequentially and combine the results.

    Args:
        articlepaths: Sized sequence of article paths.
        places: Place table forwarded to ``get_single_article_matches``.
        countries: Country table forwarded likewise.
        tag: Tag name forwarded likewise.

    Returns:
        One DataFrame holding the rows of all per-article results with a
        fresh 0..n-1 index, or an empty DataFrame when ``articlepaths``
        is empty. A progress line is printed after each article.
    """
    n = len(articlepaths)
    frames = []
    total = 0
    for i, path in enumerate(articlepaths, start=1):
        matches = get_single_article_matches(path, places, countries, tag)
        frames.append(matches)
        total += len(matches)
        print(('Matches in article %i of %i: %i.'
               ' Total matches: %i' % (i, n, len(matches), total)))
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return DataFrame()
    # Concatenate once at the end: DataFrame.append copied the whole
    # accumulated frame on every iteration and no longer exists in
    # pandas 2.x.
    return pd.concat(frames, ignore_index=True)


def get_all_matches_parallel(articlepaths, places, countries, tag):
    """Match all articles with a 4-process pool via ``Pool.map``.

    Args:
        articlepaths: Sized sequence of article paths.
        places: Place table forwarded to ``get_single_article_matches``.
        countries: Country table forwarded likewise.
        tag: Tag name forwarded likewise.

    Returns:
        The per-article results concatenated in input order, with each
        result's original index preserved. An empty DataFrame is
        returned when ``articlepaths`` is empty.
    """
    if len(articlepaths) == 0:
        # Nothing to do: skip starting workers, and avoid pd.concat
        # raising on an empty list.
        return DataFrame()
    match_one = partial(get_single_article_matches,
                        places=places,
                        countries=countries,
                        tag=tag)
    # The context manager shuts the worker processes down even when a
    # task raises; map() has returned every result before the block exits.
    with Pool(processes=4) as pool:
        matches_list = pool.map(match_one, articlepaths)
    return pd.concat(matches_list)


def get_all_matches_starmap(articlepaths, places, countries, tag):
    """Match all articles with a 4-process pool via ``Pool.starmap``.

    Args:
        articlepaths: Sized sequence of article paths.
        places: Place table forwarded to ``get_single_article_matches``.
        countries: Country table forwarded likewise.
        tag: Tag name forwarded likewise.

    Returns:
        The per-article results concatenated in input order, with each
        result's original index preserved. An empty DataFrame is
        returned when ``articlepaths`` is empty.
    """
    if len(articlepaths) == 0:
        # Nothing to do: skip starting workers, and avoid pd.concat
        # raising on an empty list.
        return DataFrame()
    # One argument tuple per article; zip stops when the paths run out.
    pathzip = list(zip(articlepaths, repeat(places), repeat(countries),
                       repeat(tag)))
    # The context manager shuts the worker processes down even when a
    # task raises; starmap() has returned every result before it exits.
    with Pool(processes=4) as pool:
        matches_list = pool.starmap(get_single_article_matches, pathzip)
    return pd.concat(matches_list)


# def count_all_matches(articlepaths, places, countries, tag):
#     allmatches = get_all_matches(articlepaths, places, countries, tag)
#     counts = Counter(allmatches)
#     counts = Series(counts)
#     counts = DataFrame(counts)
#     counts.columns = ["count"]
#     return(counts)


def main():
    """Run the sequential matcher over the test articles.

    Loads the test inputs and matches places and countries in the text
    of ``p`` tags. The combined result is currently only computed, not
    saved.
    """
    articlepaths, places, countries = build_test_paths()
    # NOTE: nothing is written to disk yet. An earlier draft counted the
    # matches, joined them with ``places`` and exported
    # outdir + 'matched_places.csv'; that step is disabled.
    allmatches = get_all_matches(articlepaths, places, countries, 'p')


# Run the matcher only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()