Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🗺 Include GeoNames to verify admin boundaries #73

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ services:
- POSTGRES_USER=kuwala
- POSTGRES_PASSWORD=password
- POSTGRES_DB=kuwala
- POSTGRES_MULTIPLE_EXTENSIONS=postgis,hstore,postgis_topology,postgis_raster,pgrouting,h3
- POSTGRES_MULTIPLE_EXTENSIONS=postgis,hstore,postgis_topology,postgis_raster,pgrouting,h3,unaccent,fuzzystrmatch
ports:
- '5432:5432'
volumes:
Expand Down
8 changes: 8 additions & 0 deletions kuwala/common/python_utils/src/file_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pandas


def txt_to_csv(file_path):
read_file = pandas.read_csv(
file_path, delimiter="\t", header=None, low_memory=False
)
read_file.to_csv(file_path.replace(".txt", ".csv"), index=None, header=False)
24 changes: 24 additions & 0 deletions kuwala/core/database/importer/sql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,30 @@ CREATE TABLE IF NOT EXISTS admin_boundary (
geometry geometry
);

-- Creation of admin_boundary_geonames_cities table

CREATE TABLE IF NOT EXISTS admin_boundary_geonames_cities (
geoname_id text NOT NULL PRIMARY KEY,
name text NOT NULL,
ascii_name text,
alternate_names text[],
latitude decimal NOT NULL,
longitude decimal NOT NULL,
feature_class text NOT NULL,
feature_code text,
country_code text,
alternate_country_codes text[],
admin_1_code text,
admin_2_code text,
admin_3_code text,
admin_4_code text,
population integer NOT NULL,
elevation integer,
digital_elevation_model integer NOT NULL,
timezone text NOT NULL,
modification_date date NOT NULL
);

-- Creation of population_density table

CREATE TABLE IF NOT EXISTS population_density (
Expand Down
82 changes: 64 additions & 18 deletions kuwala/core/database/importer/src/admin_boundary_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,18 @@
from pyspark.sql.functions import lit


def import_admin_boundaries(
def import_osm_admin_boundaries(
spark,
database_host,
database_port,
database_name,
database_url,
database_properties,
continent,
country,
country_region,
file_path,
):
start_time = time.time()

logging.info(
f"Starting import of admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent}'
)

script_dir = os.path.dirname(__file__)
file_path = os.path.join(
script_dir,
f"../../../../tmp/kuwala/admin_boundary_files/{continent}/{country}"
f'{f"/{country_region}" if country_region else ""}/admin_boundaries.parquet',
)

if not os.path.exists(file_path):
logging.warning("No admin boundaries file available. Skipping import.")
logging.warning("No OSM admin boundaries file available. Skipping import.")
return

data = (
Expand All @@ -58,6 +43,67 @@ def import_admin_boundaries(
path_to_query_file="../sql/create_admin_boundary_geometries.sql",
)


def import_geonames_cities(spark, database_url, database_properties, file_path):
if not os.path.exists(file_path):
logging.warning("No GeoNames city names available. Skipping import.")
return

data = spark.read.parquet(file_path)

data.write.option("truncate", True).jdbc(
url=database_url,
table="admin_boundary_geonames_cities",
mode="overwrite",
properties=database_properties,
)


def import_admin_boundaries(
spark,
database_host,
database_port,
database_name,
database_url,
database_properties,
continent,
country,
country_region,
):
start_time = time.time()

logging.info(
f"Starting import of admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent}'
)

script_dir = os.path.dirname(__file__)
file_path_geonames_cities = os.path.join(
script_dir, "../../../../tmp/kuwala/admin_boundary_files/cities_500.parquet"
)
file_path_osm_admin_boundaries = os.path.join(
script_dir,
f"../../../../tmp/kuwala/admin_boundary_files/{continent}/{country}"
f'{f"/{country_region}" if country_region else ""}/admin_boundaries.parquet',
)

import_geonames_cities(
spark=spark,
database_url=database_url,
database_properties=database_properties,
file_path=file_path_geonames_cities,
)
import_osm_admin_boundaries(
spark=spark,
database_host=database_host,
database_port=database_port,
database_name=database_name,
database_url=database_url,
database_properties=database_properties,
country=country,
file_path=file_path_osm_admin_boundaries,
)

logging.info(
f"Successfully imported admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent} after '
Expand Down
14 changes: 12 additions & 2 deletions kuwala/core/database/transformer/dbt/dbt_project.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
Expand All @@ -25,6 +24,8 @@ clean-targets: # directories to be removed by `dbt clean`
- "dbt_modules"

vars:
# Country to match cities
country: 'MT'
# POI brand
focus_brand: 'focus_brand_id_prefix'
# Aggregation grid
Expand All @@ -43,10 +44,19 @@ vars:

models:
kuwala_core_transformer:
admin_boundaries:
cities:
+materialized: table
poi:
poi:
+materialized: table
poi_address_city:
+materialized: table
poi_address_country:
+materialized: table
poi_aggregated:
+materialized: table
poi_matched:
+materialized: table
poi_popularity_time_series:
+materialized: table
+materialized: table
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{% macro get_all_versions_of_city_names(country_code) %}
{% set query %}
SELECT DISTINCT geoname_id, unnest(ascii_name || alternate_names) AS name
FROM admin_boundary_geonames_cities
WHERE country_code = '{{ country_code }}'
{% endset %}

{{ return(query) }}
{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{% macro match_admin_boundaries_with_city_names(country_code) %}
{% set all_versions_of_city_names = get_all_versions_of_city_names(country_code) %}

{% set query %}
SELECT id, geoname_id, levenshtein(ab.name, avocn.name) AS levenshtein_distance
FROM admin_boundary AS ab LEFT JOIN ({{ all_versions_of_city_names }}) AS avocn ON
ab.name = avocn.name OR
unaccent(ab.name) = avocn.name OR
ab.name LIKE avocn.name || '%' OR
unaccent(ab.name) LIKE avocn.name || '%'
{% endset %}

{{ return(query) }}
{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
SELECT city_name, geometry
FROM {{ ref('city_candidates') }}
INNER JOIN (
SELECT city_name AS city_name_best_match, MIN(min_levenshtein_distance) AS min_levenshtein_distance_best_match
FROM {{ ref('city_candidates') }}
GROUP BY city_name
) AS best_city_candidates ON
city_candidates.city_name = best_city_candidates.city_name_best_match AND
city_candidates.min_levenshtein_distance = best_city_candidates.min_levenshtein_distance_best_match
WHERE st_contains(geometry, st_setsrid(st_makepoint(candidate_longitude, candidate_latitude), 4326))
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
SELECT
abgc.ascii_name AS city_name,
ab.name AS admin_boundary_name,
min_levenshtein_distance,
ab.geometry AS geometry,
abgc.latitude AS candidate_latitude,
abgc.longitude AS candidate_longitude
FROM (
SELECT id, geoname_id, levenshtein_distance
FROM ({{ match_admin_boundaries_with_city_names(var('country')) }}) AS mabwcn
WHERE geoname_id IS NOT NULL AND levenshtein_distance < 10
) AS matched_cities
INNER JOIN (
SELECT id AS id_best_match, MIN(levenshtein_distance) AS min_levenshtein_distance
FROM ({{ match_admin_boundaries_with_city_names(var('country')) }}) AS mabwcn
WHERE geoname_id IS NOT NULL AND levenshtein_distance < 10
GROUP BY id
) AS matched_cities_min_levenshtein_distances ON
matched_cities.id = matched_cities_min_levenshtein_distances.id_best_match AND
matched_cities.levenshtein_distance = matched_cities_min_levenshtein_distances.min_levenshtein_distance
LEFT JOIN admin_boundary_geonames_cities AS abgc USING (geoname_id)
LEFT JOIN admin_boundary AS ab USING (id)
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

version: 2

models:
# Cities
- name: cities
description: 'List of matched city names from GeoNames against OSM admin boundaries'
columns:
- name: city_name
description: 'City name from GeoNames'
tests:
- not_null
- name: geometry
description: 'Geometry based on matched admin boundary'
tests:
- not_null
# City candidates
- name: city_candidates
description: 'List of matched city names from GeoNames against OSM admin boundaries'
columns:
- name: city_name
description: 'City name from GeoNames'
tests:
- not_null
- name: admin_boundary_name
description: 'Name of matched admin boundary'
tests:
- not_null
- name: min_levenshtein_distance
description: 'Levenshtein distance of GeoNames city name and admin boundary name'
tests:
- not_null
- name: geometry
description: 'Geometry of matched admin boundary'
tests:
- not_null
2 changes: 1 addition & 1 deletion kuwala/core/database/transformer/dbt/models/poi/poi.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
SELECT
poi_matched.*,
poi_address_city.name AS poi_address_city,
poi_address_city.city_name AS poi_address_city,
poi_address_country.name AS poi_address_country,
h3_to_parent(poi_h3_index::h3index, 8) AS poi_h3_index_res_8,
h3_to_parent(poi_h3_index::h3index, 9) AS poi_h3_index_res_9,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
SELECT poi_id, name, id
FROM admin_boundary AS ab, {{ ref('poi_matched') }} AS poi
WHERE
kuwala_admin_level = (SELECT max(kuwala_admin_level) FROM admin_boundary) AND
st_contains(ab.geometry, st_setsrid(st_makepoint(poi.longitude, poi.latitude), 4326))
SELECT DISTINCT poi_id, city_name
FROM {{ ref('cities') }} AS cities, {{ ref('poi_matched') }} AS poi
WHERE st_contains(cities.geometry, st_setsrid(st_makepoint(poi.longitude, poi.latitude), 4326))
12 changes: 4 additions & 8 deletions kuwala/core/database/transformer/dbt/models/poi/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,25 +98,21 @@ models:
- accepted_values:
values: [ 15 ]
- name: poi_address_city
description: 'Name of the admin boundary with the highest kuwala_admin_level containing POI coordinates'
description: 'Name of the city containing POI coordinates based on matched GeoNames and OSM data'
- name: poi_address_country
description: 'Name of admin boundary having kuwala_admin_level 1 containing POI coordinates'

# POI city based on admin boundary
- name: poi_address_city
description: 'Admin boundary name with highest admin level containing POI coordinates'
description: 'City containing POI coordinates based on matched GeoNames and OSM data'
columns:
- name: poi_id
description: 'Unique ID referring to a POI'
tests:
- unique
- not_null
- name: name
description: 'Name of matching admin boundary'
tests:
- not_null
- name: id
description: 'ID of matching admin boundary'
- name: city_name
description: 'Name of matching city'
tests:
- not_null

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,4 +391,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Loading