Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

'Pacific Island' no longer matches Iceland (IS) #34

Merged
merged 4 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

## [5.3.0] - 2024-11-04

### Changed

- "Island" no longer matches Iceland when language is not set.

### Fixed

- "Pacific Island" no longer matches Iceland.

## [5.2.2] - 2024-08-23

- Detection for "moldavie" as "MD" in French.
Expand Down
10 changes: 8 additions & 2 deletions geoconvert/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
DE_POSTCODE_RANGE,
NUTS_CODES_BY_COUNTRY,
all_nuts_regex,
ambiguous_country_names,
br_postcode_regex,
br_state_code_regex,
br_state_name_regex,
Expand Down Expand Up @@ -413,8 +414,13 @@ def _full_name_to_country_code(text, lang, language_to_full_names):
# If the language is unknown, just do not use any language.
language_to_full_names = {}

for lang, full_names in language_to_full_names.items():
country_code = _full_name_to_country_code_for_lang(text, lang, full_names)
for language, full_names in language_to_full_names.items():
# When no language is set, remove ambiguous country names from the countries list.
if not lang:
full_names = full_names.copy()
for name in ambiguous_country_names:
full_names.pop(name, None)
country_code = _full_name_to_country_code_for_lang(text, language, full_names)
if country_code:
return country_code

Expand Down
1 change: 1 addition & 0 deletions geoconvert/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .capitals import capitals_de, capitals_en, capitals_fr, language_to_capital_names
from .countries import (
ambiguous_country_names,
countries_de,
countries_en,
countries_fr,
Expand Down
12 changes: 12 additions & 0 deletions geoconvert/data/countries.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,17 @@
"haute vienne": "FR", # fr
}

# We want to make sure pacific island is not registered as IS (Iceland) in German.
# Other languages doesn't have issues with it and we shouldn't prevent them from retrieving
# other countries like FJ (Fiji) in "Fiji/Pacific Island"
special_countries_de = {
"pacific island": None, # en
}
mounasb marked this conversation as resolved.
Show resolved Hide resolved

# We register here names matching common words in other languages to prevent
# matching them when language is not defined
ambiguous_country_names = ["island"]


countries_fr = {
**special_countries,
Expand Down Expand Up @@ -597,6 +608,7 @@


countries_de = {
**special_countries_de,
**special_countries,
**{
"agypten": "EG",
Expand Down
6 changes: 4 additions & 2 deletions geoconvert/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@ def safe_string(text):
'how are you'
>>> safe_string("l’ocean")
"l'ocean"
>>> safe_string('Fiji/Pacific Island')
'fiji pacific island'
"""
text = remove_accents(text)
# Replace "-" and ":" with a whitespace
text = re.sub(r"[-:]", " ", text)
# Replace "-", ":" and "/" with a whitespace
text = re.sub(r"[-:/]", " ", text)
# Replace weird '
text = re.sub(r"[ʼ]", "'", text)
# Only keep word or space characters as well as "_", and "'".
Expand Down
12 changes: 9 additions & 3 deletions tests/test_countries.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,14 @@ class TestCountries:
("Heard Island", {}, "HM"), # en
("Norfolk Island", {}, "NF"), # en
("Solomon Islands", {}, "SB"), # en
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
("Solomon Islands", {}, "SB"), # en
("Solomon Islands", {}, "SB"), # en
# However, in cases where island is singular instead of plural,
# there can be confusion.
("Solomon Island Nationals", {}, None), # en

("Prince Edward Island", {}, "CA"), # en
("Rhode Island", {}, "US"), # en
# However, in cases where island is singular instead of plural,
# there can be confusion.
("Solomon Island Nationals", {}, "IS"), # en
("Solomon Island Nationals", {}, None), # en
("Prince Edward Island", {}, "CA"), # en
("Rhode Island", {}, "US"), # en
("Pacific island", {}, None), # en
("Fiji/Pacific island", {}, "FJ"), # en
("island", {}, None), # en
# Any capitalization for lang works
("Germany", {"lang": "en"}, "DE"),
("Germany", {"lang": "En"}, "DE"),
Expand Down Expand Up @@ -164,6 +167,7 @@ def test_country_name_to_country_code_fr(self, input_data, expected):
("state of palestine", "PS"),
("palestine, state of", "PS"),
("Irak", None),
("island", None),
],
)
def test_country_name_to_country_code_en(self, input_data, expected):
Expand Down Expand Up @@ -191,6 +195,8 @@ def test_country_name_to_country_code_en(self, input_data, expected):
(" Land der Hinrichtung : Deutschland", "DE"),
("Dschibuti-Stadt", "DJ"),
("elfenbeinküste und ecuador ", "CI"),
("Fiji/Pacific island", None), # en
("island", "IS"),
],
)
def test_country_name_to_country_code_de(self, input_data, expected):
Expand Down
Loading