From c3df146ce494519008d73dc3916b8cdf2f785cae Mon Sep 17 00:00:00 2001
From: "J.-S. Caux" <J.S.Caux@uva.nl>
Date: Sun, 9 Jun 2019 22:06:11 +0200
Subject: [PATCH] Partial work on detect_markup (not complete)

---
 markup/constants.py |  35 +++++++++
 markup/utils.py     | 186 +++++++++++++++++++++++++++++++++++++++++---
 markup/views.py     |   2 +
 3 files changed, 213 insertions(+), 10 deletions(-)

diff --git a/markup/constants.py b/markup/constants.py
index 3a354dbf4..3cddd012d 100644
--- a/markup/constants.py
+++ b/markup/constants.py
@@ -2,6 +2,41 @@ __copyright__ = "Copyright © Stichting SciPost (SciPost Foundation)"
 __license__ = "AGPL v3"
 
 
+# Regular expressions recognizing reStructuredText headers, keyed by symbol.
+# This follows the Python conventions: order is #, *, =, -, ", ^ and
+# for the first two levels (# and *), over- and underlining are necessary, while
+# only underlining is needed for the lower four levels.
+# Searches must use the re.MULTILINE flag so that ^ anchors at every line start.
+ReST_HEADER_REGEX_DICT = {
+    '#': r'^(#{1,}\n).+\n\1',  # backreference \1: underline must equal overline
+    '*': r'^(\*{1,}\n).+\n\1',  # backreference \1: underline must equal overline
+    '=': r'^={1,}\n',  # underline only: a line made solely of the symbol
+    '-': r'^-{1,}\n',
+    '"': r'^"{1,}\n',
+    '^': r'^\^{1,}\n'
+}
+
+# Known ReST roles, see http://docutils.sourceforge.net/0.4/docs/ref/rst/roles.html
+ReST_ROLES = [
+    "math",  # also searched for on its own by detect_markup_language
+    "emphasis", "literal", "pep-reference", "rfc-reference",
+    "strong", "subscript", "superscript", "title-reference"
+]
+
+# Known ReST directives; see the list of reStructuredText directives at
+# http://docutils.sourceforge.net/0.4/docs/ref/rst/directives.html
+ReST_DIRECTIVES = [
+    "math",  # also searched for on its own by detect_markup_language
+    "attention", "caution", "danger", "error", "hint", "important", "note", "tip",
+    "warning", "admonition",
+    "topic", "sidebar", "parsed-literal", "rubric", "epigraph", "highlights",
+    "pull-quote", "compound", "container",
+    "table", "csv-table", "list-table",
+    "contents", "sectnum", "section-autonumbering", "header", "footer",
+    "target-notes",
+    "replace", "unicode", "date", "class", "role", "default-role"
+]
+
 BLEACH_ALLOWED_TAGS = [
     'a', 'abbr', 'acronym', 'b', 'blockquote', 'br', 'code', 'em',
     'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'li', 'ol',
diff --git a/markup/utils.py b/markup/utils.py
index f4cd117ed..e3ecc7650 100644
--- a/markup/utils.py
+++ b/markup/utils.py
@@ -4,36 +4,202 @@ __license__ = "AGPL v3"
 
 import re
 
+from .constants import ReST_HEADER_REGEX_DICT, ReST_ROLES, ReST_DIRECTIVES
+
+
+# Inline or displayed math
+def match_inline_math(text):
+    r"""Return the first match object for inline maths $...$ or \(...\), else None."""
+    # The dollar form is tried first. Its [^$] class may span newlines, while
+    # the \(...\) form must sit on a single line (no re.DOTALL here).
+    return (re.search(r'\$[^$]+\$', text)
+            or re.search(r'\\\(.+\\\)', text))
+
+def match_displayed_math(text):
+    r"""Return the first match object for displayed maths $$...$$ or \[...\], else None."""
+    # re.DOTALL lets '.' cross newlines, so equations written over several
+    # lines are captured; the $$ form is tried before the \[...\] form.
+    return (re.search(r'\$\$.+\$\$', text, re.DOTALL)
+            or re.search(r'\\\[.+\\\]', text, re.DOTALL))
+
+
+# Markdown
+def match_md_header(text, level=None):
+    """
+    Return first match object for a Markdown header of at least ``level`` #'s.
+
+    If no level is given, levels 1 to 6 are tried in turn (first match or None).
+    Raises TypeError for a non-int level and ValueError outside 1 to 6.
+    """
+    if not level:
+        matches = (match_md_header(text, newlevel) for newlevel in range(1, 7))
+        return next((match for match in matches if match), None)
+    if not isinstance(level, int):
+        raise TypeError('level must be an int')
+    if level < 1 or level > 6:
+        raise ValueError('level must be an integer from 1 to 6')
+    # level is an int, so it must be formatted (not concatenated) into the
+    # pattern; re.MULTILINE makes ^ and $ anchor on every line of the text.
+    return re.search(r'^#{%d,}[ ].+$' % level, text, re.MULTILINE)
+
+def match_md_blockquote(text):
+    """Return the first match object for a Markdown blockquote in text, or None."""
+    return re.compile(r'(^[ ]*>[ ].+){1,5}', re.DOTALL | re.MULTILINE).search(text)
+
+
+# reStructuredText
+def match_rst_role(text, role=None):
+    """
+    Return first match object of regex search for given ReST role :role:`...`.
+
+    If no role is given, all roles in ReST_ROLES are tested one by one.
+
+    Raises ValueError if a role is given which is not listed in ReST_ROLES.
+    """
+    if role:
+        if role not in ReST_ROLES:
+            raise ValueError('this role is not listed in ReST roles')
+        return re.search(r':' + role + ':`.+`', text)
+    # No role requested: try every known role, in list order.
+    attempts = (match_rst_role(text, candidate) for candidate in ReST_ROLES)
+    return next((found for found in attempts if found), None)
+
+def match_rst_directive(text, directive=None):
+    """
+    Return first match object of regex search for given ReST directive.
+
+    If no directive is given, all directives in ReST_DIRECTIVES are tested one by one.
+
+    The first one to three lines after the directive statement are also captured.
+
+    Raises ValueError if a directive is given which is not in ReST_DIRECTIVES.
+    """
+    if not directive:
+        matches = (match_rst_directive(text, new) for new in ReST_DIRECTIVES)
+        return next((match for match in matches if match), None)
+    if directive not in ReST_DIRECTIVES:
+        raise ValueError('this directive is not listed in ReST directives')
+    # Plain .* instead of the nested (.+)* keeps the same overall match while
+    # avoiding exponential backtracking on a long line with no newline after it.
+    return re.search(r'^\.\. ' + directive + r'::(.*)(\n(.*)){1,3}', text, re.MULTILINE)
+
+def match_rst_header(text, symbol):
+    """
+    Return first match object (or None) for a ReST header using ``symbol``.
+
+    ``#`` and ``*`` headers need identical over- and underlines (so faulty
+    ones are not matched); ``=``, ``-``, ``"`` and ``^`` need only the
+    underline. Raises ValueError if symbol is not a key of the regex dict.
+    """
+    if symbol in ReST_HEADER_REGEX_DICT:
+        return re.search(ReST_HEADER_REGEX_DICT[symbol], text, re.MULTILINE)
+    raise ValueError('symbol is not a ReST header symbol')
+
 
 def detect_markup_language(text):
     """
-    Detect which markup language is being used.
+    Detect whether text is plain text, Markdown or reStructuredText.
 
     This method returns a dictionary containing:
-
     * language
     * errors
 
-    where ``language`` can be one of: plain, reStructuredText, Markdown
+    Inline and displayed maths are assumed enabled through MathJax.
+    For plain text and Markdown, this assumes the conventions
+    * inline: $ ... $ and \\( ... \\)
+    * displayed: $$ ... $$ and \\[ ... \\]
+
+    while for reStructuredText, the ``math`` role and directive are used.
+
+    We define markers and indicators. A marker is a regex which occurs
+    in only one of the languages. An indicator occurs in more than one,
+    but not all languages.
+
+    Language markers:
+
+    Markdown:
+    * headers: [one or more #] [non-empty text]
+    * blockquotes: one or more lines starting with > [non-empty text]
 
-    The criteria used are:
+    reStructuredText:
+    * use of the :math: role or .. math:: directive
+    * [two or more #][blank space][carriage return]
+      [text on a single line, as long as or shorter than # sequence]
+      [same length of #]
+    * same thing but for * headlines
+    * use of any other role
+    * use of any other directive
 
-    * if the ``math`` role or directive is found together with $...$, return error
-    * if the ``math`` role or directive is found, return ReST
+    Language indicators:
 
-    Assumptions:
+    Plain text or Markdown:
+    * inline or displayed maths
 
-    * MathJax is set up with $...$ for inline, \[...\] for online equations.
+    Markdown or reStructuredText:
+    * [=]+ alone on a line  <- users discouraged to use this in Markdown
+    * [-]+ alone on a line  <- users discouraged to use this in Markdown
+
+    Exclusions (sources of errors):
+    * inline or displayed maths cannot be used in ReST
+
+    The criteria used for error reporting are:
+
+    * if the ``math`` role or directive is found together with inline/displayed maths
     """
 
+    # Start from the default assumption
+    detector = {
+        'language': 'plain',
+        'errors': None
+    }
+
+    # Inline maths is of the form $ ... $ or \( ... \)
+    inline_math = match_inline_math(text)
+
+    # Displayed maths is of the form \[ ... \] or $$ ... $$
+    displayed_math = match_displayed_math(text)
+
+    rst_math_role = match_rst_role(text, 'math')
+    rst_math_directive = match_rst_directive(text, 'math')
+
+    md_header = match_md_header(text)
+    md_blockquote = match_md_blockquote(text)
+
+    if rst_math_role or rst_math_directive:
+        if inline_math:
+            detector['errors'] = (
+                'You have mixed inline maths ($ ... $ or \\( ... \\) ) with '
+                'reStructuredText markup.\n\nPlease use one or the other, but not both!')
+            return detector
+        elif displayed_math:
+            detector['errors'] = (
+                'You have mixed displayed maths ($$ ... $$ or \\[ ... \\]) with '
+                'reStructuredText markup.\n\nPlease use one or the other, but not both!')
+            return detector
+        elif md_header:
+            detector['errors'] = (
+                'You have mixed Markdown headers with reStructuredText math roles/directives.'
+                '\n\nPlease use one language only.')
+        elif md_blockquote:
+            detector['errors'] = (
+                'You have mixed Markdown blockquotes with reStructuredText math roles/directives.'
+                '\n\nPlease use one language only.')
+        else:
+            detector['language'] = 'reStructuredText'
+
+    return detector
+
+
+
+def detect_markup_language_old(text):
     # Inline maths
-    inline_math = re.search("\$[^$]+\$", text)
+    inline_math = match_inline_math(text)
     # if inline_math:
     #     print('inline math: %s' % inline_math.group(0))
 
     # Online maths is of the form \[ ... \]
     # The re.DOTALL is to also capture newline chars with the . (any single character)
-    online_math = re.search(r'[\\][[].+[\\][\]]', text, re.DOTALL)
+    online_math = match_displayed_math(text)
     # if online_math:
     #     print('online math: %s' % online_math.group(0))
 
diff --git a/markup/views.py b/markup/views.py
index 507e1adbd..ea4fea602 100644
--- a/markup/views.py
+++ b/markup/views.py
@@ -2,6 +2,7 @@ __copyright__ = "Copyright © Stichting SciPost (SciPost Foundation)"
 __license__ = "AGPL v3"
 
 
+from django.contrib.auth.decorators import login_required
 from django.http import JsonResponse
 from django.shortcuts import render
 
@@ -10,6 +11,7 @@ from .constants import MathSnippets, PlainTextSnippets,\
 from .forms import MarkupTextForm
 
 
+@login_required
 def process(request):
     """
     API call to process the POSTed text.
-- 
GitLab