From 2caaf863a26661cf2aa4e9d54fc074242855ec5e Mon Sep 17 00:00:00 2001
From: George Katsikas <giorgakis.katsikas@gmail.com>
Date: Wed, 5 Jul 2023 14:53:04 +0200
Subject: [PATCH] Refactor unaccent utility and add latinisation utility

---
 scipost_django/common/constants.py | 91 ++++++++----------------------
 scipost_django/common/utils.py     | 59 ++++++++++++++++---
 2 files changed, 77 insertions(+), 73 deletions(-)

diff --git a/scipost_django/common/constants.py b/scipost_django/common/constants.py
index 31499ef62..e925a1d65 100644
--- a/scipost_django/common/constants.py
+++ b/scipost_django/common/constants.py
@@ -17,72 +17,31 @@ CHARACTER_ALTERNATIVES = {
     "Ü": "Ue",
 }
 
-CHARACTER_UNACCENTED = {
-    "à": "a",
-    "À": "A",
-    "á": "a",
-    "Á": "A",
-    "â": "a",
-    "Â": "A",
-    "ä": "ae",
-    "Ä": "Ae",
-    "ã": "a",
-    "Ã": "A",
-    "å": "a",
-    "Å": "A",
-    "ç": "c",
-    "Ç": "C",
-    "ć": "c",
-    "Ć": "c",
-    "é": "e",
-    "É": "E",
-    "è": "e",
-    "È": "E",
-    "ê": "e",
-    "Ê": "E",
-    "ë": "e",
-    "Ë": "E",
-    "ę": "e",
-    "Ę": "E",
-    "í": "i",
-    "Í": "I",
-    "ì": "i",
-    "Ì": "I",
-    "î": "i",
-    "Î": "I",
-    "ï": "i",
-    "Ï": "I",
-    "ł": "l",
-    "Ł": "L",
-    "ñ": "n",
-    "Ñ": "N",
-    "ń": "n",
-    "Ń": "N",
-    "ó": "o",
-    "Ó": "O",
-    "ò": "o",
-    "Ò": "O",
-    "ô": "o",
-    "Ô": "O",
-    "ö": "oe",
-    "Ö": "Oe",
-    "õ": "o",
-    "Õ": "O",
+# Character latinisations are used to convert non-ASCII letters and symbols
+# to their latinised equivalents / lookalikes.
+CHARACTER_LATINISATIONS = {
+    "æ": "ae",
+    "Æ": "Ae",
+    "œ": "oe",
+    "Œ": "Oe",
+    "ß": "ss",
+    "ð": "d",
+    "Ð": "D",
     "ø": "o",
     "Ø": "O",
-    "ś": "s",
-    "Ś": "S",
-    "ß": "ss",
-    "ú": "u",
-    "Ú": "U",
-    "ù": "u",
-    "Ù": "U",
-    "û": "u",
-    "Û": "U",
-    "ü": "ue",
-    "Ü": "Ue",
-    "ź": "z",
-    "Ź": "Z",
-    "ż": "z",
-    "Ż": "Z",
+    "ł": "l",
+    "Ł": "L",
+    "ĳ": "ij",
+    "Ĳ": "IJ",
+    "ŋ": "ng",
+    "Ŋ": "Ng",
+    "ȶ": "t",
+    "ȷ": "j",
+    "ɉ": "j",
+    "ɋ": "q",
+    "µ": "u",
+    "√": "v",
+    "a\u0301": "a",  # "a" + combining acute accent
+    "a\u0328": "a",  # "a" + combining ogonek
+    "ı": "i",
 }
diff --git a/scipost_django/common/utils.py b/scipost_django/common/utils.py
index 5d3e35bd2..9e5b7fd34 100644
--- a/scipost_django/common/utils.py
+++ b/scipost_django/common/utils.py
@@ -9,17 +9,62 @@ from django.core.mail import EmailMultiAlternatives
 from django.db.models import Q
 from django.template import loader
 
-from .constants import CHARACTER_ALTERNATIVES, CHARACTER_UNACCENTED
+from .constants import CHARACTER_ALTERNATIVES, CHARACTER_LATINISATIONS
 
+import unicodedata
 
-def unaccent(text):
+
+def unaccent(text: str) -> str:
+    """
+    Strip diacritics from the given string (e.g. é -> e), except for the
+    German umlauts ä/ö/ü, which gain a trailing "e" (e.g. ö -> oe, ë -> e).
+    """
+    UMLAUT = "\u0308"
+    UMLAUT_BASES = frozenset("aouAOU")
+    unaccented_text = ""
+    for char in unicodedata.normalize("NFD", text):
+        char_category = unicodedata.category(char)
+        # Drop combining marks ("Mn"); expand a diaeresis only on a/o/u.
+        if char_category != "Mn":
+            unaccented_text += char
+        elif char == UMLAUT and unaccented_text[-1:] in UMLAUT_BASES:
+            unaccented_text += "e"
+
+    return unaccented_text
+
+
+def latinise(text: str) -> str:
     """
-    Replace accented characters by unaccented ones.
+    Reduce the given string to ASCII: drop diacritics (e.g. ö -> o) and map
+    other letters via CHARACTER_LATINISATIONS, removing unmapped ones.
     """
-    unaccented = text
-    for key, val in CHARACTER_UNACCENTED.items():
-        unaccented = unaccented.replace(key, val)
-    return unaccented
+    latinised_text = ""
+    for char in unicodedata.normalize("NFD", text):
+        char_category = unicodedata.category(char)
+
+        translated_char = char
+        is_latin = ord(char) < 128
+
+        # Keep spaces and dashes (en dash becomes "-" to survive the ASCII filter)
+        if char in [" ", "-", "–"]:
+            translated_char = "-" if char == "–" else char
+        # Remove apostrophes and parentheses
+        elif char in ["'", "’", "(", ")"]:
+            translated_char = ""
+        # Translate only letters, symbols and punctuation
+        # skipping numbers and other characters (e.g. diacritics)
+        elif char_category[0] in ["L", "S", "P"] and not is_latin:
+            translated_char = CHARACTER_LATINISATIONS.get(char, "")
+
+        # Remove everything not in the ASCII range
+        translated_char = translated_char.encode("ascii", "ignore").decode("utf-8")
+
+        latinised_text += translated_char
+
+    # Collapse runs of whitespace and trim both ends
+    latinised_text = " ".join(latinised_text.split())
+
+    return latinised_text
 
 
 def alternative_spellings(text):
-- 
GitLab