Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid disambiguation from street names and city names in fr_dept_name_to_dept_code #36

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

### Fixed

- Avoid false matches caused by street names and city names in `fr_dept_name_to_dept_code`

## [5.3.1] - 2024-11-06

### Added
Expand Down
27 changes: 27 additions & 0 deletions geoconvert/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,11 +296,38 @@ def fr_postcode_to_dept_code(text):
# Keep backward compatibility
address_to_zipcode = fr_postcode_to_dept_code

# Street-type keywords. Text introduced by one of these (as in "rue de Paris")
# is treated as a street name, so a department name inside it is not a location.
_fr_street_keywords = [
    "boulevard",
    "avenue",
    "chemin",
    "rue",
    "route",
    "impasse",
    "place",
    "passage",
    "ruelle",
    "quai",
    "all.e",  # "." matches any single character, e.g. the "é" of "allée"
]
fr_street_names_re = "|".join(_fr_street_keywords)
# Matches a whole-word street keyword plus up to 20 following characters,
# stopping early at a digit, "(", ",", a newline or "-".
fr_street_name_cleaning_re = re.compile(
    r"\b(" + fr_street_names_re + r")\b[^\d\(,\n-]{,20}",
    flags=re.IGNORECASE,
)
# Avoid "Ville-sur-Loire" situations: matches "<word> sous|sur|val de <word>",
# where each separator may be any single character (space, hyphen, ...).
fr_town_name_cleaning_re = re.compile(
    r"\w+.\b(sous|sur|val\Wde)\b.(\w+)",
    flags=re.IGNORECASE,
)


def fr_dept_name_to_dept_code(text):
"""
Return the departement number from the departement name
"""
# Avoid "rue de Paris" situations
text = fr_street_name_cleaning_re.sub("", text)
# Avoid "Ville-sur-Loire" situations
text = fr_town_name_cleaning_re.sub("", text)

# There is no space in french dept names, but hyphens instead.
text = safe_string(text).replace(" ", "-")

Expand Down
46 changes: 36 additions & 10 deletions tests/test_subdivisions/test_france.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ class TestFrance:
("Loire Atlanti)que", "44"),
("Yonne", "89"),
("Saint Pierre et Miquelon", "975"),
("Tout savoir sur Saint Barthélemy", "977"),
("Tout savoir sur saint-barthelemy", "977"),
("Tout savoir sur saint Barthélémy", "977"),
("Tout savoir à propos de Saint Barthélemy", "977"),
("Tout savoir à propos de saint-barthelemy", "977"),
("Tout savoir à propos de saint Barthélémy", "977"),
# Region names
("Pays de la Loire", "44"),
# Special cases for the old French région "Centre"
Expand All @@ -117,9 +117,23 @@ class TestFrance:
# Both dept name and region name
("Guyane", "973"),
("Guadeloupe", "971"),
# Avoid disambiguations
# due to street names
("Rue de la Réunion 61000 Alencon", "61"), # "réunion" could mean 974
("rue de Paris, Nantes", None), # "paris" could mean 75
("Rue de l'Orne, 44800 Saint-Herblain (44)", "44"), # "Orne" could be 61
# due to city names
("Sully sur Loire (Loiret)", "45"), # "Loire" could mean 42
("Gournay-sous-Marne (Seine saint Denis)", "93"), # "Marne" could be 51
# due to phrase
("en val-de-Loire", None), # "Loire" could be 42
# The current strategy has drawbacks
("Tout savoir sur Saint Barthélemy", None),
# There can be some mistakes, that we may want to fix one day.
("Vallées de l'Orne et de l'Odon", "61"), # in 14
("commune de Saint-Vincent-des-Landes", "40"), # in 44
# In this case, we could look for 2 or 3 digit
("Rue de l'Orne, Saint-Herblain (44)", "61"),
("CHU 44", None), # in 44
],
)
def test_fr_address_to_dept_code(self, input_data, expected):
Expand Down Expand Up @@ -228,12 +242,24 @@ def test_fr_postcode_to_dept_code(self, input_data, expected):
("Loire Atlanti)que", "44"),
("Yonne", "89"),
("Saint Pierre et Miquelon", "975"),
("Tout savoir sur Saint Barthélemy", "977"),
("Tout savoir sur saint-barthelemy", "977"),
("Tout savoir sur saint Barthélémy", "977"),
# There may be some mistakes, so be careful what is passed
("Rue de la Réunion, 75000 Paris", "974"),
("Rue de l'Orne, 44800 Saint-Herblain", "61"),
("Tout savoir à propos de Saint Barthélemy", "977"),
("Tout savoir à propos de saint-barthelemy", "977"),
("Tout savoir à propos de saint Barthélémy", "977"),
# Avoid disambiguations
# due to street names
("Rue de la Réunion 61000 Alencon", None), # "réunion" could mean 974
("rue de Paris, Nantes", None), # "paris" could mean 75
("Rue de l'Orne, Saint-Herblain (44)", None), # "Orne" could be 61
# due to city names
("Sully sur Loire (Loiret)", "45"), # "Loire" could mean 42
("Gournay-sous-Marne (Seine saint Denis)", "93"), # "Marne" could be 51
# due to phrase
("en val-de-Loire", None), # "Loire" could be 42
# The current strategy has drawbacks
("Tout savoir sur Saint Barthélemy", None),
# There can be some mistakes, that we may want to fix one day.
("Vallées de l'Orne et de l'Odon", "61"), # in 14
("commune de Saint-Vincent-des-Landes", "40"), # in 44
],
)
def test_fr_dept_name_dept_code(self, input_data, expected):
Expand Down
Loading