diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..69478e4 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,42 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "dev" ] + pull_request: + branches: [ "dev" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8","3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python -m spacy download fi_core_news_sm + python -m pip install . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a4d7943 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +# Ignore Python cache files +__pycache__/ + +# Ignore system and editor files +.DS_Store +*.swp +*.swo +*.bak +*~ diff --git a/finger/__init__.py b/finger/__init__.py deleted file mode 100644 index d2cc938..0000000 --- a/finger/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# __init__.py - -#from finger.location_coder import * -#from finger.location_tagger import * -#from finger.output_formatter import * diff --git a/finger/geoparser.py b/finger/geoparser.py deleted file mode 100644 index 5062437..0000000 --- a/finger/geoparser.py +++ /dev/null @@ -1,148 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed Mar 24 18:55:46 2021 - -@author: Tatu Leppämäki -""" - - -from finger.location_tagger import location_tagger -from finger.location_coder import location_coder -from finger.output_formatter import create_eupeg_json - -import time - -class geoparser: - """ - The geoparser handles a whole geoparsing pipeline from geotagging to geocoding. - It accepts a list of Finnish text strings as input. It then runs those texts - through a BERT-based neural linguistic and NER analysis pipeline built on Spacy. - The objective of this analysis is to find references to locations, such as - countries, towns, remarkable places etc., although the pipeline also runs general - named entity recognition and things like dependency parsing and part-of-speech tagging - on the side. Each input sentence can have zero to n locations in them. The locations are - lemmatized using the Voikko library. The first part of the geoparsing process is called (geo)tagging. - - The tagger results are gathered on a Pandas dataframe consisting of five columns, - with each analysis of a sentence on a single row. The dataframe is passed to - the (geo)coder, which attempts to return coordinate representations of the locations. - Currently, it relies on the GeoNames gazetteer, which is queried using a Python - module called GeoCoder. 
If locations are found, coordinate point representations - of them are returned as tuples or as Shapely points in WGS84 (EPSG:4326) CRS. - - """ - - def __init__(self, pipeline_path="fi_geoparser", use_gpu=False, - output_df=True, gn_username="", verbose=True): - """ - Parameters: - pipeline_path | String: name of the Spacy pipeline, which is called with spacy.load(). - "fi_geoparser", which is the installation name, by default, - however, a path to the files can also be provided. - - - use_gpu | Boolean: Whether the pipeline is run on the GPU (significantly faster, but often missing in - e.g. laptops) or CPU (slower but should run every time). Default True. - - output_df | Boolean: If True, the output will be a Pandas DataFrame. False does nothing currently. - - gn_username | String: GeoNames API key, or username, which is used for geocoding. - Mandatory, get from https://www.geonames.org/ - - verbose | Boolean: Prints progress reports. Default True. - - - """ - - self.tagger = location_tagger(pipeline_path, use_gpu=use_gpu) - - self.coder = location_coder(gn_username=gn_username) - - self.verbose=verbose - - - def geoparse(self, texts, ids=None, explode_df=False, return_shapely_points=False, - drop_non_locations=False, output='all', filter_toponyms=True, entity_tags=['LOC']): - """ - The whole geoparsing pipeline. - - Input: - texts | A string or a list of input strings: The input text(s) - *ids | String, int, float or a list: Identifying element of each input, e.g. tweet id. Must be - the same length as texts - *explode_df | Boolean: Whether to have each location "hit" on separate rows in the output. Default False - *return_shapely_points | Boolean: Whether the coordinate points of the locations are - regular tuples or Shapely points. Default False. - *drop_non_locations | Boolean: Whether the sentences where no locations were found are - included in the output. Default False (non-locs are included). - *output | String: What's included in the output and in what format it is. - Possible values: - 1. 'all': All columns listed below as a dataframe - TODO 2. 'essential': Dataframe trimmed down selection of columns - 3. 'eupeg': - *filter_toponyms | Boolean: Whether to filter out almost certain false positive toponyms. - Currently removes toponyms with length less than 2. Default True. - - Output columns: - Pandas Dataframe containing columns: - 1. input_text: the input sentence | String - 2. doc: Spacy doc object of the sent analysis. See https://spacy.io/api/doc | Doc - 3. locations_found: Whether locations were found in the input sent | Bool - 4. locations: locations in the input text, if found | list of strings or None - 5. loc_lemmas: lemmatized versions of the locations | list of strings or None - 6. loc_spans: the index of the start and end characters of the identified - locations in the input text string | tuple - 7. input_order: the index of the inserted texts. i.e. the first text is 0, the second 1 etc. - Makes it easier to reassemble the results if they're exploded | int' - 8. names: versions of the names returned by querying GeoNames | List of strins or None - 9. coord_points: long/lat coordinate points in WGS84 | list of long/lat tuples or Shapely points - 10.*id: The identifying element tied to each input text, if provided | string, int, float - OR - EUPEG (see here: https://github.com/geoai-lab/EUPEG) style json dump, with restucturing data and renaming headers to be in line. - Mostly meant for evaluation purposes. 
This option only allows one text to be processed at once (no batch processing). - - """ - assert texts, "Input missing. Expecting a (list of) strings." - - # fix if someone passes just a string - if isinstance(texts, str): - texts = [texts] - - if output.lower() == 'eupeg': - explode_df = True - - # check that ids are in proper formats and lengths - if ids: - if isinstance(ids, (str, int, float)): - ids = [ids] - assert len(texts) == len(ids), "If ids are passed, the number of ids and texts must be equal." - - - if self.verbose: - print("Starting geotagging...") - t = time.time() - - # TOPONYM RECOGNITION - tag_results = self.tagger.tag_sentences(texts, ids, explode_df=explode_df, - drop_non_locs=drop_non_locations, - filter_toponyms=filter_toponyms, - entity_tags=entity_tags) - - if self.verbose: - successfuls = tag_results['locations_found'].tolist() - print("Finished geotagging after", round(time.time()-t, 2),"s.", successfuls.count(True), "location hits found.") - print("Starting geocoding...") - - # TOPONYM RESOLVING - geocode_results = self.coder.geocode_batch(tag_results, shp_points=False, - exploded=explode_df) - - - if self.verbose: - print("Finished geocoding, returning output.") - print("Total elapsed time:", round(time.time()-t, 2),"s") - - if output.lower() == 'eupeg': - return create_eupeg_json(geocode_results) - else: - return geocode_results diff --git a/finger/location_coder.py b/finger/location_coder.py deleted file mode 100644 index db1965a..0000000 --- a/finger/location_coder.py +++ /dev/null @@ -1,143 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed Mar 24 18:53:37 2021 - -@author: Tatu Leppämäki -""" -import geocoder.geonames as gn -#import pandas as pd -""" -try: - from shapely.geometry import Point -except (ImportError, FileNotFoundError) as e: - print("Unable to import Shapely. The geoparser works, but exporting to Shapely points is unavailable.") -""" -class location_coder: - - def __init__(self, gn_username=""): - """ - A geocoder, which currently accepts a Pandas dataframe (must be of certain - format, which mostly makes this usable as part of a geoparser pipeline) - and outputs a dataframe. The following columns are appended to the input df: - - 1. gn_names: versions of the locations returned by querying GeoNames | List of strings or None - 2. gn_points: long/lat coordinate points in WGS84 | list of long/lat tuples or Shapely points - - TO RUN THIS GEOCODER, YOU CURRENTLY NEED A GEONAMES API KEY. The API key - can be acquired simply by creating an account in https://www.geonames.org/ - Pass your account name as gn_username parameter. - """ - - self.username_count = 0 - - if isinstance(gn_username, (list, tuple, set)): - self.username=gn_username[self.username_count] - self.username_list = gn_username - #self.username_list_flag = True - else: - self.username=gn_username - self.username_list = [gn_username] - #self.username_list_flag = False - - self.username=gn_username - - assert self.username, "GeoNames API key (username) must be provided for the geocoder." - - test_result = gn("London", key=self.username) - assert test_result.ok, "Geocoding failed. Did you enter a valid GeoNames API key?" - - def geocode_batch(self, locations, input_type="df", shp_points=False, exploded=False): - """ - Applies geocoding to the lemmatized locations in the input dataframe. 
- """ - - - locations['names'] = None - locations['coord_points'] = None - - self.shp_points = shp_points - - self.exploded = exploded - - self.geocoded_count = 0 - - self.username_count = 0 - - - locations = locations.apply(self.geocode_set, axis=1) - - return locations - - - def geocode_set(self, row): - """ - Geocodes input Pandas series (rows). - """ - - # if locs present, continue. otherwise do nothing - if row['locations_found']: - loc_coord_points = [] - loc_names = [] - - # fixes the problem of the next step expecting a list as an input - if self.exploded: - lemma = row['loc_lemmas'] - lemma_list = [] - lemma_list.append(lemma) - row['loc_lemmas'] = lemma_list - - - for loc in row['loc_lemmas']: - #query geonames - gn_result = gn(loc, key=self.username) - # for every query, add one to the count - self.geocoded_count += 1 - # if succesful, add the name of the place in GN and coordinates - if gn_result.ok: - loc_coord_points.append(self.form_point(gn_result)) - loc_names.append(gn_result.address) - else: - loc_coord_points.append(None) - loc_names.append(None) - - # if no error present, continue as normal - if isinstance(gn_result.error, int): - pass - # if the system throws an error, switch the username or warn the user - elif ("the hourly limit of 1000 credits") in gn_result.error: - switched = self.switch_username() - if switched: - print("\nUsername switched to username no.", str(self.username_count+1), "\n") - else: - print("\nHourly rate limit exceeded and no more GN usernames left. Rest of queries will fail.\n") - self.geocoded_count = 0 - - if all(place==None for place in loc_names): - loc_coord_points = None - loc_names = None - - row['names'] = loc_names - row['coord_points'] = loc_coord_points - - # if count nears 1000, i.e. the hourly rate limit of a GN account - # is filling, try to switch the account - - - return row - else: - return row - - def switch_username(self): - # if there are unused usernames left on the list, - if self.username_count+1 < len(self.username_list): - self.username_count += 1 - self.username=self.username_list[self.username_count] - return True - else: - return False - - def form_point(self, gn_result): - if self.shp_points: - return Point(float(gn_result.lng), float(gn_result.lat)) - else: - return (gn_result.lng, gn_result.lat) diff --git a/finger/location_tagger.py b/finger/location_tagger.py deleted file mode 100644 index d8facca..0000000 --- a/finger/location_tagger.py +++ /dev/null @@ -1,165 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed Mar 24 18:51:53 2021 - -@author: Tatu Leppämäki -""" - - -import spacy - - -class location_tagger: - """ - This class initiates a Finnish NER tagger using Spacy. - - A NER tagger object can be used to tag location mentions in input texts. It accepts list of strings - as input and outputs a Pandas dataframe, which is then passed on to the geocoder. - - Parameters: - pipeline_path | String: name of the Spacy pipeline, which is called with spacy.load(). - "fi_geoparser", which is the installation name, by default, - however, a path to the files can also be provided. - - - use_gpu | Boolean: Whether the pipeline is run on the GPU (significantly faster, but often missing in - e.g. laptops) or CPU (slower but should run every time) - - output_df | Boolean: If True, the output will be a Pandas DataFrame. If False, a dictionary. - Currently, False does nothing. Left in if newer versions implement something - other than Pandas (just nested dictionaries?) 
- """ - - - def __init__(self, pipeline_path="fi_geoparser", use_gpu=True, - output_df=True): - if use_gpu: - spacy.require_gpu() - else: - spacy.require_cpu() - - self.output_df = output_df - - self.ner_pipeline = spacy.load(pipeline_path) - - - - - - def tag_sentences(self, input_texts, ids, explode_df=False, drop_non_locs=False, - filter_toponyms = True, entity_tags=['LOC'] - ): - """Input: - texts | A string or a list of input strings: The input - *ids | String, int, float or a list: Identifying element of each input, e.g. tweet id. Must be - the same length as texts - *explode_df | Boolean: Whether to have each location "hit" on separate rows in the output. Default False - *drop_non_locations | Boolean: Whether the sentences where no locations were found are - included in the output. Default False (locs are included). - *filter_toponyms | Boolean: Whether to filter out almost certain false positive toponyms. - Currently removes toponyms with length less than 2. Default True. - - Output: Pandas DF containing columns: - 1. input_text: the input sentence | String - 2. doc: Spacy doc object of the sent analysis. See https://spacy.io/api/doc | Doc - 3. locations_found: Whether locations were found in the input sent | Bool - 4. locations: locations in the input text, if found | list of strings or None - 5. loc_lemmas: lemmatized versions of the locations | list of strings or None - 6. loc_spans: the index of the start and end characters of the identified - locations in the input text string | tuple - 7. input_order: the index of the inserted texts. i.e. the first text is 0, the second 1 etc. - Makes it easier to reassemble the results if they're exploded | int' - *8. id: The identifying element tied to each input text, if provided | string, int, float - """ - assert input_texts, "No input provided. Make sure to input a list of strings." - tagged_sentences = [] - - self.explode_df = explode_df - - self.drop_non_locs = drop_non_locs - - self.filter_toponyms = filter_toponyms - - self.entity_tags = entity_tags - - # loop input sentences, gather the tagged dictionary results to a list - for sent in input_texts: - tag_results = self.tag_sentence(sent) - tagged_sentences.append(tag_results) - - return self.to_dataframe(tagged_sentences, ids) - """ - if self.output_df: - return self.to_dataframe(tagged_sentences) - else: - return tagged_sentences - """ - - def tag_sentence(self, sent): - """Input: a sentence to tag (string) - Output: a dictionary with the same variables as listed in 'tag_sentences'""" - doc = self.ner_pipeline(sent) - - # if the tagger created an output, i.e. at least one of the words in the input - # was tagged, create an output of that. 
Otherwise, return a mostly empty dict - docs = [] - locs = [] - loc_lemmas = [] - loc_spans = [] - locations_found = False - - if doc: - # gather the NER labels found to a list - labels = [ent.label_ for ent in doc.ents] - - locs = [] - # looping through the entities, collecting required information - # namely, the raw toponym text, its lemmatized form and the span as tuple - for ent in doc.ents: - if ent.label_ in self.entity_tags: - # apply filtering if requested - if self.filter_toponyms: - # length filtering - if len(ent.text)>1: - locs.append(ent.text) - loc_lemmas.append(ent.lemma_.replace("#","")) - loc_spans.append((ent.start_char, ent.end_char)) - locations_found = True - else: - locs.append(ent.text) - loc_lemmas.append(ent.lemma_.replace("#","")) - loc_spans.append((ent.start_char, ent.end_char)) - locations_found = True - docs.append(doc) - - if locations_found: - sent_results = {'input_text': sent, 'doc': doc, 'locations_found': locations_found, - 'locations': locs, 'loc_lemmas': loc_lemmas, 'loc_spans': loc_spans} - else: - sent_results = {'input_text': sent, 'doc': doc, 'locations': None, 'loc_lemmas': None, - 'loc_spans': None, 'locations_found': locations_found} - - return sent_results - - def to_dataframe(self, results, ids): - import pandas as pd - - df = pd.DataFrame(results) - - - if ids: - df['id'] = ids - - df['input_order'] = df.index - - # split the possible list contents into multiple rows - if self.explode_df: - df = df.apply(lambda x: x.explode() if x.name in ['locations', 'loc_lemmas', 'loc_spans'] else x) - if self.drop_non_locs: - return self.drop_non_locations(df) - else: - return df - - def drop_non_locations(self, df): - df = df[df['locations_found']] - return df \ No newline at end of file diff --git a/fingerGeoparser/__init__.py b/fingerGeoparser/__init__.py new file mode 100644 index 0000000..64b18de --- /dev/null +++ b/fingerGeoparser/__init__.py @@ -0,0 +1,5 @@ +# __init__.py + +#from finger.toponym_coder import * +#from finger.toponym_tagger import * +#from finger.output_formatter import * diff --git a/fingerGeoparser/geoparser.py b/fingerGeoparser/geoparser.py new file mode 100644 index 0000000..68f4377 --- /dev/null +++ b/fingerGeoparser/geoparser.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 24 18:55:46 2021 + +@author: Tatu Leppämäki + +""" + + +from fingerGeoparser.toponym_tagger import toponym_tagger +from fingerGeoparser.toponym_coder import toponym_coder +from fingerGeoparser.output_formatter import create_eupeg_json + + +import time, asyncio, pandas as pd + +class geoparser: + """ + The geoparser handles a whole geoparsing pipeline from geotagging to geocoding. + It accepts a list of Finnish text strings as input. It then runs those texts + through a BERT-based neural linguistic and NER analysis pipeline built on Spacy. + The objective of this analysis is to find references to locations, such as + countries, towns, remarkable places etc., although the pipeline also runs general + named entity recognition and things like dependency parsing and part-of-speech tagging + on the side. Each input sentence can have zero to n locations in them. The locations are + lemmatized using the Voikko library. The first part of the geoparsing process is called (geo)tagging. + + The tagger results are gathered on a Pandas dataframe consisting of five columns, + with each analysis of a sentence on a single row. The dataframe is passed to + the (geo)coder, which attempts to return coordinate representations of the locations. 
+    Currently, it relies on a Pelias geocoding service, which is queried over HTTP.
+    If locations are found, their coordinates are returned in WGS84 (EPSG:4326) CRS,
+    along with the other attributes requested from the geocoder.
+
+    """
+
+    def __init__(self,
+                 pipeline_path="fi_geoparser",
+                 use_gpu=False,
+                 output_df=True,
+                 verbose=True,
+                 geocoder_url="http://vm5121.kaj.pouta.csc.fi:4000/v1/"):
+        """
+        Parameters:
+            pipeline_path | String: name of the Spacy pipeline, which is called with spacy.load().
+                            "fi_geoparser", which is the installation name, by default,
+                            however, a path to the files can also be provided.
+
+            use_gpu | Boolean: Whether the pipeline is run on the GPU (significantly faster, but often missing in
+                            e.g. laptops) or CPU (slower but should run every time). Default False.
+
+            output_df | Boolean: If True, the output will be a Pandas DataFrame. False does nothing currently.
+
+            verbose | Boolean: Prints progress reports. Default True.
+
+            geocoder_url : str, optional
+                URL for the Pelias geocoder instance. Default instance is maintained by the author for now and is located at "http://vm5121.kaj.pouta.csc.fi:4000/v1/".
+
+        """
+
+        self.tagger = toponym_tagger(pipeline_path, use_gpu=use_gpu)
+
+        self.coder = toponym_coder(geocoder_url)
+
+        self.verbose = verbose
+
+    def geoparse(self,
+                 texts,
+                 ids=None,
+                 explode_df=True,
+                 return_shapely_points=False,
+                 preprocess_texts=False,
+                 drop_non_locations=False,
+                 output='all',
+                 filter_toponyms=True,
+                 entity_tags=['LOC', 'FAC', 'GPE'],
+                 geocoder_columns=['coordinates', 'gid', 'layer', 'label', 'bbox'],
+                 geocoder_params=None):
+        """
+        The whole geoparsing pipeline.
+
+        Input:
+            texts | str or List[str]: A string or a list of input strings representing the text(s) to be processed.
+
+            ids | str, int, float, or List[str/int/float], optional: Identifying element of each input, e.g., tweet id.
+                            Must be the same length as texts. Default is None.
+
+            explode_df | bool, optional: Whether to have each location "hit" on separate rows in the output. Default is True.
+
+            return_shapely_points | bool, optional: Whether the coordinate points of the locations are regular tuples
+                            or Shapely points. Default is False.
+
+            preprocess_texts | bool, optional: Whether to preprocess the input texts before geoparsing. Default is False.
+
+            drop_non_locations | bool, optional: Whether the sentences where no locations were found are included in the output.
+                            Default is False (non-locs are included).
+
+            output | str, optional: What's included in the output and in what format it is. Possible values:
+                            'all': All columns listed below as a dataframe
+                            'eupeg': EUPEG style JSON dump. Default is 'all'.
+
+            filter_toponyms | bool, optional: Whether to filter out almost certain false positive toponyms.
+                            Currently removes toponyms with a length less than 2. Default is True.
+
+            entity_tags | List[str], optional: Which named entity tags to count as toponyms. Default is ['LOC', 'FAC', 'GPE'].
+
+            geocoder_columns | List[str], optional: Columns to include in the geocoder results. Default is
+                            ['coordinates', 'gid', 'layer', 'label', 'bbox'].
+            geocoder_params | Dict[str], optional: Parameters to limit the search to, for example, a certain country. Provide as {'parameter':'value'} dictionaries. For example: {'boundary.country':'FIN'} See https://github.com/pelias/documentation/blob/master/search.md for a full list of search parameters. 
+
+        Output:
+            Pandas DataFrame containing columns:
+            - input_text: the input sentence
+            - toponyms_found: Whether any toponyms were found in the input sentence.
+            - toponyms: toponyms in the input text, if found.
+            - topo_lemmas: lemmatized versions of the toponyms.
+            - topo_labels: named entity labels of the toponyms.
+            - topo_spans: the index of the start and end characters of the identified
+              toponyms in the input text string.
+            - input_order: the index of the inserted texts. i.e., the first text is 0, the second 1, etc.
+              Makes it easier to reassemble the results if they're exploded.
+            - id: the identifying element tied to each input text, if ids were provided.
+            - the geocoder columns requested with 'geocoder_columns', e.g. coordinates (long/lat in WGS84), gid, layer, label and bbox.
+
+        Returns:
+            Pandas DataFrame or dict: Depending on the 'output' parameter, either a Pandas DataFrame is returned
+            containing the geoparsing results, or a dictionary in EUPEG style JSON format.
+        """
+
+        # Validate inputs
+        if not texts:
+            raise ValueError("Input texts are missing. Expecting a string or a list of strings.")
+
+        # fix if someone passes just a string
+        if isinstance(texts, str):
+            texts = [texts]
+
+        if output.lower() == 'eupeg':
+            explode_df = True
+
+        # check that ids are in proper formats and lengths
+        if ids:
+            if isinstance(ids, (str, int, float)):
+                ids = [ids]
+            if len(ids) != len(texts):
+                raise ValueError("If ids are provided, the number of ids must be equal to the number of texts.")
+
+        if self.verbose:
+            print("Starting geotagging...")
+            t = time.time()
+
+        # TOPONYM RECOGNITION
+        tag_results = self.tagger.tag_sentences(texts, ids, explode_df=explode_df,
+                                                drop_non_locs=drop_non_locations,
+                                                filter_toponyms=filter_toponyms,
+                                                entity_tags=entity_tags,
+                                                preprocess=preprocess_texts)
+
+        if self.verbose:
+            successfuls = tag_results['toponyms_found'].tolist()
+            print("Finished geotagging after", round(time.time()-t, 2),"s.", successfuls.count(True), "location hits found.")
+            print("Starting geocoding...")
+
+        # TOPONYM RESOLVING
+        # TODO: Reimplement shp_points
+        geocode_results = asyncio.run(self.coder.geocode_toponyms(tag_results['topo_lemmas'].tolist(),
+                                                                  columns=geocoder_columns,
+                                                                  params=geocoder_params))
+
+        geocoded = pd.DataFrame(geocode_results)
+
+        tag_results = tag_results.reset_index(drop=True)
+
+        # concatenate (add the columns) from the geocoder results to the tagging results to produce the final result
+        results = pd.concat([tag_results, geocoded], axis=1)
+
+        if self.verbose:
+            print("Finished geocoding, returning output.")
+            print("Total elapsed time:", round(time.time()-t, 2),"s")
+
+        if output.lower() == 'eupeg':
+            return create_eupeg_json(results)
+        else:
+            return results
diff --git a/finger/output_formatter.py b/fingerGeoparser/output_formatter.py
similarity index 100%
rename from finger/output_formatter.py
rename to fingerGeoparser/output_formatter.py
diff --git a/fingerGeoparser/toponym_coder.py b/fingerGeoparser/toponym_coder.py
new file mode 100644
index 0000000..5ba3206
--- /dev/null
+++ b/fingerGeoparser/toponym_coder.py
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Mar 24 18:53:37 2021
+
+@author: Tatu Leppämäki
+"""
+
+#import pandas as pd
+import requests
+import aiohttp
+import asyncio
+from tqdm.asyncio import tqdm
+
+#try:
+#    from shapely.geometry import Point
+#except (ImportError, FileNotFoundError) as e:
+#    print("Unable to import Shapely. 
The geoparser works, but exporting to Shapely points is unavailable.")
+
+
+class toponym_coder:
+
+    def __init__(self, geocoder_url="http://vm5121.kaj.pouta.csc.fi:4000/v1/"):
+        """
+        Calls a geocoder at the defined URL and returns a dictionary of responses.
+        """
+
+        self.geocoder_url = geocoder_url
+        assert self.geocoder_url, "A valid URL pointing to a running Pelias geocoding service must be provided."
+
+        # test the service with a simple query before accepting the URL
+        params = {'text':'Kamppi'}
+        res = requests.get(geocoder_url+'search', params=params)
+        assert res.status_code == 200, f"Geocoder from url {geocoder_url} did not return all OK. The path could be faulty or the service unavailable."
+
+    async def geocode_toponyms(self, toponyms, columns=['coordinates', 'gid', 'layer', 'label', 'bbox'], params=None):
+        """Input: a list of toponyms; in default operation, these are the lemmatized toponyms recognized in the previous step.
+        'columns' selects which fields of the geocoder response are collected, and 'params' passes extra Pelias search parameters (e.g. {'boundary.country':'FIN'}).
+        Output: a dictionary with one list per requested column, for example:
+            coordinates - coordinates in WGS84 longitude-latitude format
+            label - a textual description of the toponym as returned by the geocoder
+            gid - a unique label that internally identifies the location. These are not stable and can change as the data in the geocoder is updated.
+            bbox - the bounding box of the location, if available."""
+
+        lists = {key: list() for key in columns}
+
+        responses = await self.batch_get(toponyms, params=params)
+
+        for response in responses:
+            # for each response, check if the geocoder returned something (if it failed, it will not have 'features'). NB! The status will still be 200 for empty responses
+            if response and response['features']:
+
+                for key in lists.keys():
+                    # loop through the requested columns, append values
+                    # because the keys may not be at the base level, I need to do this clumsy hardcode for acquiring the correct values
+
+                    # related to geometry
+                    if key in ('type', 'coordinates'):
+                        lists[key].append(response['features'][0]['geometry'][key])
+                    # if not, it's probably at the properties level
+                    elif key != 'bbox':
+                        lists[key].append(response['features'][0]['properties'][key])
+                    # else a bounding box, which is at the base level
+                    else:
+                        lists[key].append(response['features'][0][key])
+            else:
+                # if nothing was returned, append None to all lists
+                for this_list in lists.values():
+                    this_list.append(None)
+
+        return lists
+
+    async def batch_get(self, topos, params=None):
+        """This function forms the query URLs, which are then asynchronously requested from the geocoder."""
+        # avoid badgering the server with too many requests at once -> leads to http errors
+        # this limits the concurrent connections to 15 (default 100)
+        connector = aiohttp.TCPConnector(limit=15)
+
+        async with aiohttp.ClientSession(connector=connector) as session:
+            tasks = []
+            for topo in topos:
+                if topo:
+                    # if there's a lemmatized toponym, try searching with that. If not, return an empty string
+                    url = f"{self.geocoder_url}search"
+                    if params:
+                        url_params = {'text': topo, **params}
+                    else:
+                        url_params = {'text': topo}
+                    task = asyncio.ensure_future(self.get_response(session, url, params=url_params))
+                    tasks.append(task)
+                else:
+                    task = asyncio.ensure_future(self.return_none())
+                    tasks.append(task)
+            # tqdm.gather works as a wrapper for asyncio.gather: it adds a progress bar
+            responses = await tqdm.gather(*tasks, desc="Geocoding...")
+
+        return responses
+
+    async def get_response(self, session, url, params=None):
+        """Setup one request. 
If the response code is something other than 200, print the error code.""" + async with session.get(url, params=params) as response: + if response.status != 200: + print(url, response.status) + return await response.json() + + async def return_none(self): + return "" +""" + def form_point(self, gn_result): + if self.shp_points: + return Point(float(gn_result.lng), float(gn_result.lat)) + else: + return (gn_result.lng, gn_result.lat) +""" diff --git a/fingerGeoparser/toponym_tagger.py b/fingerGeoparser/toponym_tagger.py new file mode 100644 index 0000000..10df125 --- /dev/null +++ b/fingerGeoparser/toponym_tagger.py @@ -0,0 +1,201 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 24 18:51:53 2021 + +@author: Tatu Leppämäki +""" + + +import spacy, pandas as pd, re + +from tqdm import tqdm + + +class toponym_tagger: + """ + This class initiates a Finnish NER tagger using Spacy. + + A NER tagger object can be used to tag location mentions in input texts. It accepts list of strings + as input and outputs a Pandas dataframe, which is then passed on to the geocoder. + + Parameters: + pipeline_path | String: name of the Spacy pipeline, which is called with spacy.load(). + "fi_geoparser", which is the installation name, by default, + however, a path to the files can also be provided. + + + use_gpu | Boolean: Whether the pipeline is run on the GPU (significantly faster, but often missing in + e.g. laptops) or CPU (slower but should run every time) + + output_df | Boolean: If True, the output will be a Pandas DataFrame. If False, a dictionary. + Currently, False does nothing. Left in if newer versions implement something + other than Pandas (just nested dictionaries?) + """ + + + def __init__(self, pipeline_path="fi_geoparser", use_gpu=True, + output_df=True): + if use_gpu: + resp = spacy.prefer_gpu() + + if use_gpu and not resp: + print("Using GPU failed, falling back on CPU...") + + self.output_df = output_df + + self.ner_pipeline = spacy.load(pipeline_path) + + def tag_sentences(self, input_texts, ids, explode_df=False, drop_non_locs=False, preprocess=False, + filter_toponyms = True, entity_tags=['LOC', 'FAC', 'GPE'] + ): + """Input: + texts | A string or a list of input strings: The input + *ids | String, int, float or a list: Identifying element of each input, e.g. tweet id. Must be + the same length as texts + *explode_df | Boolean: Whether to have each location "hit" on separate rows in the output. Default False + *drop_non_locations | Boolean: Whether the sentences where no locations were found are + included in the output. Default False. + *preprocess | Boolean: Whether to remove noise from the input texts, such as @-mentions and urls. + *filter_toponyms | Boolean: Whether to filter out almost certain false positive toponyms. + Currently removes toponyms with length less than 2. Default True. + + Output: Pandas DF containing columns: + 1. input_text: the input sentence | String + 2. doc: Spacy doc object of the sent analysis. See https://spacy.io/api/doc | Doc + 3. toponyms_found: Whether locations were found in the input sent | Bool + 4. locations: locations in the input text, if found | list of strings or None + 5. topo_lemmas: lemmatized versions of the locations | list of strings or None + 6. topo_spans: the index of the start and end characters of the identified + locations in the input text string | tuple + 7. input_order: the index of the inserted texts. i.e. the first text is 0, the second 1 etc. + Makes it easier to reassemble the results if they're exploded | int' + *8. 
id: The identifying element tied to each input text, if provided | string, int, float + """ + assert input_texts, "No input provided. Make sure to input a list of strings." + tagged_sentences = [] + + self.explode_df = explode_df + + self.drop_non_locs = drop_non_locs + + self.filter_toponyms = filter_toponyms + + self.entity_tags = entity_tags + + # apply preprocessing step, if requested + if preprocess: + input_texts = [self.preprocess_sent(sent) for sent in tqdm(input_texts, desc="Preprocessing input...")] + + # run spacy pipeline + tag_results = list(tqdm(self.ner_pipeline.pipe(input_texts), total=len(input_texts), desc="Running toponym recognition...")) + + # gather the wanted features from spacy doc objects into a dictionary of lists + tagged_sentences = [self.get_features(sent) for sent in tag_results] + + return self.to_dataframe(tagged_sentences, ids) + """ + if self.output_df: + return self.to_dataframe(tagged_sentences) + else: + return tagged_sentences + """ + + def get_features(self, doc): + """Input: a sentence to tag (string) + Output: a dictionary with the same variables as listed in 'tag_sentences'""" + #doc = self.ner_pipeline(sent) + + # if the tagger created an output, i.e. at least one of the words in the input + # was tagged, create an output of that. Otherwise, return a mostly empty dict + #docs = [] + toponyms = [] + topo_labels = [] + topo_lemmas = [] + topo_spans = [] + toponyms_found = False + + # gather the NER labels found to a list + labels = [ent.label_ for ent in doc.ents] + + + # looping through the entities, collecting required information + # namely, the raw toponym text, its lemmatized form and the span as tuple + for ent in doc.ents: + if ent.label_ in self.entity_tags: + # apply filtering if requested + if self.filter_toponyms: + # length filtering + if len(ent.text)>1: + toponyms.append(ent.text) + topo_labels.append(ent.label_) + + # remove hashtags, which mark word boundaries in compound words + lemma = ent.lemma_.replace("#","") + # in addition, remove punctuation characters, if they were captured by the tagger + # included are various quotation marks + lemma = re.sub(r'[.?!;:\'"“”‘’]', '', lemma) + + # add lemmatized versions of the toponyms to list + topo_lemmas.append(lemma) + # spans; character start and end locations + topo_spans.append((ent.start_char, ent.end_char)) + toponyms_found = True + else: + toponyms.append(ent.text) + topo_labels.append(ent.label_) + topo_lemmas.append(ent.lemma_.replace("#","")) + topo_spans.append((ent.start_char, ent.end_char)) + toponyms_found = True + #docs.append(doc) + + if toponyms_found: + doc_results = {'input_text': doc.text, 'toponyms': toponyms, 'topo_lemmas': topo_lemmas, + 'topo_labels':topo_labels, 'topo_spans': topo_spans,'toponyms_found': toponyms_found} + else: + doc_results = {'input_text': doc.text, 'locations': None, 'topo_lemmas': None, + 'topo_labels':None,'topo_spans': None, 'toponyms_found': toponyms_found} + + return doc_results + + def preprocess_sent(self, sent): + """Optionally cleans up noise (especially prominent in social media posts): removes emojis (TODO), mentions (@xyz), hashtags (#, but not the content) and URLs. + Based on work by Hiippala et al. 2020: Mapping the languages of Twitter in Finland: richness and diversity in space and time. 
See: https://zenodo.org/record/4279402
+        """
+        # Remove all mentions (@) in the input
+        sent = re.sub(r'@\S+ *', '', sent)
+
+        # remove hashes from hashtags
+        sent = sent.replace('#', '')
+
+        # remove old school heart emojis <3
+        sent = sent.replace('<3', '')
+
+        # remove ampersands, both the HTML entity (&amp;) and the bare character (&)
+        sent = re.sub(r'&amp;|&', '', sent)
+
+        # remove URLs: i.e. remove everything that follows http(s) until a whitespace
+        sent = re.sub(r'http[s]?://\S+', "", sent)
+
+        return sent
+
+    def to_dataframe(self, results, ids):
+        df = pd.DataFrame(results)
+
+        if ids:
+            df['id'] = ids
+
+        df['input_order'] = df.index
+
+        # split the possible list contents into multiple rows
+        if self.explode_df:
+            df = df.apply(lambda x: x.explode() if x.name in ['toponyms', 'topo_labels', 'topo_lemmas', 'topo_spans'] else x)
+        if self.drop_non_locs:
+            return self.drop_non_locations(df)
+        else:
+            return df
+
+    def drop_non_locations(self, df):
+        """Remove input strings / rows where the tagger did not find any toponyms."""
+        df = df[df['toponyms_found']]
+        return df
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ef6e9c4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+pandas
+spacy==3.5.2
+spacy-transformers==1.1.9
+tqdm
+aiohttp
diff --git a/setup.cfg b/setup.cfg
index 61ea5db..ffb148a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = fingerGeoparser
-version = 0.1.0
+version = 0.2.0
 author = Tatu Leppämäki
 author_email = tatu.leppamaki@helsinki.fi
 url = https://github.com/Tadusko/fi-geoparser
@@ -13,11 +13,12 @@ license = MIT License
 [options]
 install_requires =
     pandas
-    click==7.1.2
-    spacy==3.0.8
-    spacy_transformers==1.0.0
-    libvoikko
-    geocoder
-    voikko
+    spacy==3.5.2
+    spacy_transformers==1.1.9
+    tqdm
+    aiohttp
+
+
 package_dir =
-    = finger
+    = .
+
diff --git a/tests/test_geoparsing.py b/tests/test_geoparsing.py
new file mode 100644
index 0000000..5c88ff5
--- /dev/null
+++ b/tests/test_geoparsing.py
@@ -0,0 +1,14 @@
+import sys
+print(sys.path)
+
+from fingerGeoparser import geoparser
+
+def test_constructor():
+    gp = geoparser.geoparser(pipeline_path="fi_core_news_sm")
+    assert isinstance(gp, geoparser.geoparser)
+
+def test_method():
+    gp = geoparser.geoparser(pipeline_path="fi_core_news_sm")
+    res = gp.geoparse(["Helsinki on kaunis tänään", "Paris Hilton mokasi."])
+
+    #assert isinstance(gp, geoparser)
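
Usage note (not part of the patch): a minimal sketch of how the renamed package and the new Pelias-backed pipeline fit together, assuming the fi_geoparser model is installed and the default geocoder instance from this diff is reachable. The example sentences, ids and the boundary.country filter are illustrative only.

    from fingerGeoparser import geoparser

    # Build the pipeline; both defaults below are the ones introduced in this diff.
    gp = geoparser.geoparser(pipeline_path="fi_geoparser",
                             use_gpu=False,
                             geocoder_url="http://vm5121.kaj.pouta.csc.fi:4000/v1/")

    # Geoparse two Finnish sentences; geocoder_params is meant to be forwarded to the
    # Pelias search, here restricting hits to Finland.
    results = gp.geoparse(["Helsinki on kaunis tänään.", "Matkustan huomenna Turkuun."],
                          ids=[1, 2],
                          geocoder_params={'boundary.country': 'FIN'})

    # With explode_df=True (the default), there is one row per recognized toponym,
    # combining tagger columns with the requested geocoder columns.
    print(results[['input_text', 'topo_lemmas', 'label', 'coordinates']])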
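
For reference, geocode_toponyms() in toponym_coder.py reads its columns from the first feature of each Pelias /v1/search response. A sketch of that access pattern on an abridged, made-up response; the field values are illustrative, not real geocoder output.

    # Abridged shape of one Pelias search response, as consumed by geocode_toponyms().
    response = {
        "features": [
            {
                "type": "Feature",
                "geometry": {"type": "Point", "coordinates": [24.94, 60.17]},
                "properties": {
                    "gid": "whosonfirst:locality:101748417",
                    "layer": "locality",
                    "label": "Helsinki, Finland",
                },
                "bbox": [24.82, 60.14, 25.25, 60.29],
            }
        ]
    }

    first = response["features"][0]
    # 'type' and 'coordinates' are taken from the geometry...
    print(first["geometry"]["coordinates"])   # [24.94, 60.17]
    # ...'bbox' from the feature itself...
    print(first["bbox"])                      # [24.82, 60.14, 25.25, 60.29]
    # ...and everything else from the properties.
    print(first["properties"]["label"])       # Helsinki, Finland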
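
The new test_method in tests/test_geoparsing.py leaves its assertion commented out. A possible tightening, sketched under the same assumptions as the CI workflow (fi_core_news_sm installed, the default Pelias instance reachable); the test name and assertions are suggestions, not part of the patch.

    import pandas as pd

    from fingerGeoparser import geoparser

    def test_geoparse_returns_dataframe():
        gp = geoparser.geoparser(pipeline_path="fi_core_news_sm")
        res = gp.geoparse(["Helsinki on kaunis tänään", "Paris Hilton mokasi."])

        # The default output is a DataFrame with one row per toponym hit.
        assert isinstance(res, pd.DataFrame)
        assert 'toponyms_found' in res.columns
        # Assumes the small model tags "Helsinki"; drop this line if it proves flaky.
        assert res['toponyms_found'].any()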