# Reconstructed from the added ("+") side of GitLab patch
# 2caaf863a26661cf2aa4e9d54fc074242855ec5e
# ("refactor unaccent utility add latinisation utility"), which touches
# scipost_django/common/constants.py and scipost_django/common/utils.py.
# The string literals in that patch were mis-encoded (UTF-8 bytes shown as
# cp1252); they are restored to the intended characters below.

import unicodedata

# Character latinisations are used to convert foreign letters
# to their latinised equivalents / lookalikes.
# Keys are matched against single code points of NFD-normalised text
# (see ``latinise``), so only characters WITHOUT a canonical decomposition
# can ever match there.
CHARACTER_LATINISATIONS = {
    "æ": "ae",
    "Æ": "Ae",
    "œ": "oe",
    "Œ": "Oe",
    "ß": "ss",
    "ð": "d",
    "Ð": "D",
    "ø": "o",
    "Ø": "O",
    "ł": "l",
    "Ł": "L",
    "ĳ": "ij",  # U+0133 LATIN SMALL LIGATURE IJ
    "Ĳ": "IJ",  # U+0132 LATIN CAPITAL LIGATURE IJ
    "ŋ": "ng",
    "Ŋ": "Ng",
    "ȶ": "t",  # U+0236
    "ȷ": "j",  # U+0237 dotless j
    "ɉ": "j",  # U+0249
    "ɋ": "q",  # U+024B
    "µ": "u",  # U+00B5 MICRO SIGN (not Greek mu)
    "√": "v",
    # NOTE(review): the next two keys cannot match inside ``latinise``:
    # the first is a two-code-point sequence and the second decomposes
    # under NFD.  They are kept so the constant stays backward-compatible.
    # The combining mark of the first key was partly lost in the garbled
    # source; presumably U+0301 (acute) -- TODO confirm against upstream.
    "a\u0301": "a",
    "ą": "a",
    "ı": "i",
}

_UMLAUT = "\u0308"  # COMBINING DIAERESIS
_UMLAUT_BASES = frozenset("aouAOU")  # only these form German umlauts
_EN_DASH = "\u2013"
_REMOVED_CHARS = frozenset("'\u2019()")  # apostrophes and parentheses


def unaccent(text: str) -> str:
    """
    Remove accents from the given string (e.g. é -> e), except that the
    German umlauts are expanded (ä -> ae, ö -> oe, ü -> ue).

    A diaeresis on any other base letter (ë, ï, ÿ, ...) is a plain accent
    and is simply dropped, so "Noël" becomes "Noel" rather than "Noeel".

    Args:
        text: The string to process.

    Returns:
        The string without nonspacing combining marks, in NFC form.
    """
    pieces = []
    base = ""  # last non-mark character seen; the diaeresis applies to it
    for char in unicodedata.normalize("NFD", text):
        if unicodedata.category(char) != "Mn":
            pieces.append(char)
            base = char
        elif char == _UMLAUT and base in _UMLAUT_BASES:
            pieces.append("e")
            base = ""  # never expand the same base twice
        # every other nonspacing mark is dropped

    # Recompose whatever NFD split apart that was not an accent
    # (e.g. Hangul syllables), so such text is returned unchanged.
    return unicodedata.normalize("NFC", "".join(pieces))


def latinise(text: str) -> str:
    """
    Convert the given string to a plain-ASCII lookalike (e.g. ö -> o).

    Rules, applied per code point of the NFD-normalised text:
      * apostrophes and parentheses are removed;
      * an en dash becomes an ASCII hyphen;
      * other ASCII characters are kept as they are;
      * non-ASCII letters, symbols and punctuation are replaced through
        CHARACTER_LATINISATIONS, or removed when they have no entry;
      * everything else outside ASCII (diacritics, non-ASCII digits and
        separators, ...) is removed.
    Finally, runs of whitespace are collapsed to single spaces and
    leading/trailing whitespace is stripped.

    Args:
        text: The string to latinise.

    Returns:
        The latinised, ASCII-only string.
    """
    pieces = []
    for char in unicodedata.normalize("NFD", text):
        if char in _REMOVED_CHARS:
            continue
        if char == _EN_DASH:
            # Dashes are meant to be kept; an en dash is not ASCII, so it
            # has to be converted or the ASCII-only rule would drop it.
            pieces.append("-")
        elif ord(char) < 128:
            pieces.append(char)
        elif unicodedata.category(char)[0] in "LSP":
            replacement = CHARACTER_LATINISATIONS.get(char, "")
            # Guard against a non-ASCII replacement sneaking into the table.
            pieces.append(replacement.encode("ascii", "ignore").decode("ascii"))
        # anything else outside ASCII is dropped

    # Remove multiple spaces
    return " ".join("".join(pieces).split())