jurismarches · PierrickLP · Nov 6, 2024 · Nov 4, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,11 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 
+## [5.2.3] - 2024-11-04
+
+- "Pacific Island" no longer matches Iceland.
+- "Island" no longer matches Iceland when language is not set.
+
 ## [5.2.2] - 2024-08-23
 
 - Detection for "moldavie" as "MD" in French.

diff --git a/geoconvert/convert.py b/geoconvert/convert.py
@@ -9,6 +9,7 @@
     DE_POSTCODE_RANGE,
     NUTS_CODES_BY_COUNTRY,
     all_nuts_regex,
+    ambiguous_countries,
     br_postcode_regex,
     br_state_code_regex,
     br_state_name_regex,
@@ -413,8 +414,15 @@ def _full_name_to_country_code(text, lang, language_to_full_names):
         # If the language is unknown, just do not use any language.
         language_to_full_names = {}
 
-    for lang, full_names in language_to_full_names.items():
-        country_code = _full_name_to_country_code_for_lang(text, lang, full_names)
+    for language, full_names in language_to_full_names.items():
+        # When language is not set, we remove ambiguous countries names from the countries list
+        if not lang:
+            full_names = {
+                name: code
+                for name, code in full_names.items()
+                if name not in ambiguous_countries
+            }
+        country_code = _full_name_to_country_code_for_lang(text, language, full_names)
         if country_code:
             return country_code
 

diff --git a/geoconvert/data/__init__.py b/geoconvert/data/__init__.py
@@ -1,5 +1,6 @@
 from .capitals import capitals_de, capitals_en, capitals_fr, language_to_capital_names
 from .countries import (
+    ambiguous_countries,
     countries_de,
     countries_en,
     countries_fr,

diff --git a/geoconvert/data/countries.py b/geoconvert/data/countries.py
@@ -48,6 +48,17 @@
     "haute vienne": "FR",  # fr
 }
 
+# We want to make sure pacific island is not registered as IS (Iceland) in German.
+# Other languages doesn't have issues with it and we shouldn't prevent them from retrieving
+# other countries like FJ (Fiji) in "Fiji/Pacific Island"
+special_countries_de = {
+    "pacific island": None,  # en
+}
+
+# We register here names matching common words in other languages to prevent
+# matching them when language is not defined
+ambiguous_countries = ["island"]
+
 
 countries_fr = {
     **special_countries,
@@ -597,6 +608,7 @@
 
 
 countries_de = {
+    **special_countries_de,
     **special_countries,
     **{
         "agypten": "EG",

diff --git a/geoconvert/utils.py b/geoconvert/utils.py
@@ -40,10 +40,12 @@ def safe_string(text):
     'how are you'
     >>> safe_string("l’ocean")
     "l'ocean"
+    >>> safe_string('Fiji/Pacific Island')
+    'fiji pacific island'
     """
     text = remove_accents(text)
-    # Replace "-" and ":" with a whitespace
-    text = re.sub(r"[-:]", " ", text)
+    # Replace "-", ":" and "/" with a whitespace
+    text = re.sub(r"[-:/]", " ", text)
     # Replace weird '
     text = re.sub(r"[ʼ]", "'", text)
     # Only keep word or space characters as well as "_", and "'".

diff --git a/tests/test_countries.py b/tests/test_countries.py
@@ -65,9 +65,12 @@ class TestCountries:
             ("Solomon Islands", {}, "SB"),  # en
-            ("Solomon Islands", {}, "SB"),  # en
+            ("Solomon Islands", {}, "SB"),  # en
+             # However, in cases where island is singular instead of plural,
+            # there can be confusion.
+            ("Solomon Island Nationals", {}, None),  # en
-            ("Solomon Islands", {}, "SB"),  # en
+            ("Solomon Islands", {}, "SB"),  # en
+             # However, in cases where island is singular instead of plural,
+            # there can be confusion.
+            ("Solomon Island Nationals", {}, None),  # en
             ("Prince Edward Island", {}, "CA"),  # en
             ("Rhode Island", {}, "US"),  # en
+            ("Pacific island", {}, None),  # en
+            ("Fiji/Pacific island", {}, "FJ"),  # en
+            ("island", {}, None),  # en
             # However, in cases where island is singular instead of plural,
             # there can be confusion.
-            ("Solomon Island Nationals", {}, "IS"),  # en
+            ("Solomon Island Nationals", {}, None),  # en
             # Any capitalization for lang works
             ("Germany", {"lang": "en"}, "DE"),
             ("Germany", {"lang": "En"}, "DE"),
@@ -164,6 +167,7 @@ def test_country_name_to_country_code_fr(self, input_data, expected):
             ("state of palestine", "PS"),
             ("palestine, state of", "PS"),
             ("Irak", None),
+            ("island", None),
         ],
     )
     def test_country_name_to_country_code_en(self, input_data, expected):
@@ -191,6 +195,8 @@ def test_country_name_to_country_code_en(self, input_data, expected):
             ("   Land der Hinrichtung : Deutschland", "DE"),
             ("Dschibuti-Stadt", "DJ"),
             ("elfenbeinküste und ecuador    ", "CI"),
+            ("Fiji/Pacific island", None),  # en
+            ("island", "IS"),
         ],
     )
     def test_country_name_to_country_code_de(self, input_data, expected):