From 2caaf863a26661cf2aa4e9d54fc074242855ec5e Mon Sep 17 00:00:00 2001
From: George Katsikas <giorgakis.katsikas@gmail.com>
Date: Wed, 5 Jul 2023 14:53:04 +0200
Subject: [PATCH] refactor unaccent utility add latinisation utility

---
 scipost_django/common/constants.py | 91 ++++++++----------------------
 scipost_django/common/utils.py     | 59 ++++++++++++++++---
 2 files changed, 77 insertions(+), 73 deletions(-)

diff --git a/scipost_django/common/constants.py b/scipost_django/common/constants.py
index 31499ef62..e925a1d65 100644
--- a/scipost_django/common/constants.py
+++ b/scipost_django/common/constants.py
@@ -17,72 +17,31 @@ CHARACTER_ALTERNATIVES = {
     "Ü": "Ue",
 }
 
-CHARACTER_UNACCENTED = {
-    "à": "a",
-    "À": "A",
-    "á": "a",
-    "Á": "A",
-    "â": "a",
-    "Â": "A",
-    "ä": "ae",
-    "Ä": "Ae",
-    "ã": "a",
-    "Ã": "A",
-    "å": "a",
-    "Å": "A",
-    "ç": "c",
-    "Ç": "C",
-    "ć": "c",
-    "Ć": "c",
-    "é": "e",
-    "É": "E",
-    "è": "e",
-    "È": "E",
-    "ê": "e",
-    "Ê": "E",
-    "ë": "e",
-    "Ë": "E",
-    "ę": "e",
-    "Ę": "E",
-    "í": "i",
-    "Í": "I",
-    "ì": "i",
-    "Ì": "I",
-    "î": "i",
-    "Î": "I",
-    "ï": "i",
-    "Ï": "I",
-    "ł": "l",
-    "Ł": "L",
-    "ñ": "n",
-    "Ñ": "N",
-    "ń": "n",
-    "Ń": "N",
-    "ó": "o",
-    "Ó": "O",
-    "ò": "o",
-    "Ò": "O",
-    "ô": "o",
-    "Ô": "O",
-    "ö": "oe",
-    "Ö": "Oe",
-    "õ": "o",
-    "Õ": "O",
+# Character latinisations are used to convert foreign letters
+# to their latinised equivalents / lookalikes.
+CHARACTER_LATINISATIONS = {
+    "æ": "ae",
+    "Æ": "Ae",
+    "œ": "oe",
+    "Œ": "Oe",
+    "ß": "ss",
+    "ð": "d",
+    "Ð": "D",
     "ø": "o",
     "Ø": "O",
-    "ś": "s",
-    "Ś": "S",
-    "ß": "ss",
-    "ú": "u",
-    "Ú": "U",
-    "ù": "u",
-    "Ù": "U",
-    "û": "u",
-    "Û": "U",
-    "ü": "ue",
-    "Ü": "Ue",
-    "ź": "z",
-    "Ź": "Z",
-    "ż": "z",
-    "Ż": "Z",
+    "ł": "l",
+    "Ł": "L",
+    "ĳ": "ij",
+    "Ĳ": "IJ",
+    "ŋ": "ng",
+    "Ŋ": "Ng",
+    "ȶ": "t",
+    "ȷ": "j",
+    "ɉ": "j",
+    "ɋ": "q",
+    "µ": "u",
+    "√": "v",
+    "á": "a",
+    "ą": "a",
+    "ı": "i",
 }
diff --git a/scipost_django/common/utils.py b/scipost_django/common/utils.py
index 5d3e35bd2..9e5b7fd34 100644
--- a/scipost_django/common/utils.py
+++ b/scipost_django/common/utils.py
@@ -9,17 +9,62 @@ from django.core.mail import EmailMultiAlternatives
 from django.db.models import Q
 from django.template import loader
 
-from .constants import CHARACTER_ALTERNATIVES, CHARACTER_UNACCENTED
+from .constants import CHARACTER_ALTERNATIVES, CHARACTER_LATINISATIONS
 
+import unicodedata
 
-def unaccent(text):
+
+def unaccent(text: str) -> str:
+    """
+    Remove accents from the given string (e.g. é -> e, ë -> e), except for
+    the German umlauts ä, ö, ü, which are expanded instead (e.g. ö -> oe).
+    """
+    UMLAUT = "\u0308"  # combining diaeresis, split off by NFD
+    UMLAUT_BASES = frozenset("aouAOU")
+
+    pieces, base = [], ""
+    for char in unicodedata.normalize("NFD", text):
+        if unicodedata.category(char) != "Mn":
+            base = char  # the letter any following marks attach to
+            pieces.append(char)
+        elif char == UMLAUT and base in UMLAUT_BASES:
+            pieces.append("e")
+
+    return "".join(pieces)
+
+
+def latinise(text: str) -> str:
     """
-    Replace accented characters by unaccented ones.
+    Convert the given string to plain ASCII: accents are stripped and other
+    letters are replaced by latinised lookalikes (e.g. ö -> o, ø -> o).
     """
-    unaccented = text
-    for key, val in CHARACTER_UNACCENTED.items():
-        unaccented = unaccented.replace(key, val)
-    return unaccented
+    pieces = []
+    for char in unicodedata.normalize("NFD", text):
+        is_ascii = ord(char) < 128
+
+        # Keep spaces and dashes; an en dash becomes a plain hyphen
+        # because only ASCII characters survive in the result
+        if char in [" ", "-", "–"]:
+            pieces.append("-" if char == "–" else char)
+        # Remove apostrophes and parentheses
+        elif char in ["'", "’", "(", ")"]:
+            continue
+        # ASCII characters are passed through untouched
+        elif is_ascii:
+            pieces.append(char)
+        # Translate non-ASCII letters, symbols and punctuation via the
+        # lookup table; characters without an entry are dropped, as is
+        # everything else (e.g. the combining marks split off by NFD)
+        elif unicodedata.category(char)[0] in ["L", "S", "P"]:
+            pieces.append(CHARACTER_LATINISATIONS.get(char, ""))
+
+    # Guarantee a pure-ASCII result even if a table value is not ASCII
+    latinised_text = "".join(pieces).encode("ascii", "ignore").decode("ascii")
+
+    # Collapse runs of whitespace into single spaces
+    latinised_text = " ".join(latinised_text.split())
+
+    return latinised_text
 
 
 def alternative_spellings(text):
-- 
GitLab