From c3df146ce494519008d73dc3916b8cdf2f785cae Mon Sep 17 00:00:00 2001
From: "J.-S. Caux" <J.S.Caux@uva.nl>
Date: Sun, 9 Jun 2019 22:06:11 +0200
Subject: [PATCH] Partial work on detect_markup (not complete)

---
 markup/constants.py |  35 +++++++++
 markup/utils.py     | 186 +++++++++++++++++++++++++++++++++++++++++---
 markup/views.py     |   2 +
 3 files changed, 213 insertions(+), 10 deletions(-)

diff --git a/markup/constants.py b/markup/constants.py
index 3a354dbf4..3cddd012d 100644
--- a/markup/constants.py
+++ b/markup/constants.py
@@ -2,6 +2,41 @@ __copyright__ = "Copyright © Stichting SciPost (SciPost Foundation)"
 __license__ = "AGPL v3"
 
 
+# Dictionary for regex expressions to recognize reStructuredText headers.
+# This follows the Python conventions: order is #, *, =, -, ", ^ and
+# for the first two levels (# and *), over- and underlining are necessary, while
+# only underlining is needed for the lower four levels.
+# The regex search should use the re.MULTILINE flag.
+ReST_HEADER_REGEX_DICT = {
+    '#': r'^(#{1,}\n).+\n\1',  # this makes use of a regex backreference
+    '*': r'^(\*{1,}\n).+\n\1',  # this makes use of a regex backreference
+    '=': r'^={1,}\n',
+    '-': r'^-{1,}\n',
+    '"': r'^"{1,}\n',
+    '^': r'^\^{1,}\n'
+}
+
+# See list at http://docutils.sourceforge.net/0.4/docs/ref/rst/roles.html
+ReST_ROLES = [
+    "math",
+    "emphasis", "literal", "pep-reference", "rfc-reference",
+    "strong", "subscript", "superscript", "title-reference"
+]
+
+# See list of reStructuredText directives at
+# http://docutils.sourceforge.net/0.4/docs/ref/rst/directives.html
+ReST_DIRECTIVES = [
+    "math",
+    "attention", "caution", "danger", "error", "hint", "important", "note", "tip",
+    "warning", "admonition",
+    "topic", "sidebar", "parsed-literal", "rubric", "epigraph", "highlights",
+    "pull-quote", "compound", "container",
+    "table", "csv-table", "list-table",
+    "contents", "sectnum", "section-autonumbering", "header", "footer",
+    "target-notes",
+    "replace", "unicode", "date", "class", "role", "default-role"
+]
+
 BLEACH_ALLOWED_TAGS = [
     'a', 'abbr', 'acronym', 'b', 'blockquote', 'br', 'code', 'em',
     'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'li', 'ol',
diff --git a/markup/utils.py b/markup/utils.py
index f4cd117ed..e3ecc7650 100644
--- a/markup/utils.py
+++ b/markup/utils.py
@@ -4,36 +4,202 @@ __license__ = "AGPL v3"
 
 import re
 
+from .constants import ReST_HEADER_REGEX_DICT, ReST_ROLES, ReST_DIRECTIVES
+
+
+# Inline or displayed math
+def match_inline_math(text):
+    r"""Return first match object of regex search for inline maths $...$ or \(...\)."""
+    match = re.search(r'\$[^$]+\$', text)
+    if match:
+        return match
+    return re.search(r'\\\(.+\\\)', text)
+
+def match_displayed_math(text):
+    r"""Return first match object of regex search for displayed maths $$...$$ or \[...\]."""
+    match = re.search(r'\$\$.+\$\$', text, re.DOTALL)
+    if match:
+        return match
+    return re.search(r'\\\[.+\\\]', text, re.DOTALL)
+
+
+# Markdown
+def match_md_header(text, level=None):
+    """
+    Return first match object of regex search for Markdown headers in form #{level,}.
+
+    If no level is given, all levels 1 to 6 are checked, returning the first match or None.
+    """
+    if level is None:
+        for newlevel in range(1, 7):
+            match = match_md_header(text, newlevel)
+            if match:
+                return match
+        return None
+    if not isinstance(level, int):
+        raise TypeError('level must be an int')
+    if level < 1 or level > 6:
+        raise ValueError('level must be an integer from 1 to 6')
+    return re.search(r'^#{' + str(level) + r',}[ ].+$', text, re.MULTILINE)
+
+def match_md_blockquote(text):
+    """Return first match of regex search for Markdown blockquote."""
+    return re.search(r'(^[ ]*>[ ].+){1,5}', text, re.DOTALL | re.MULTILINE)
+
+
+# reStructuredText
+def match_rst_role(text, role=None):
+    """
+    Return first match object of regex search for given ReST role :role:`...` .
+
+    If no role is given, all roles in ReST_ROLES are tested one by one.
+    """
+    if not role:
+        for newrole in ReST_ROLES:
+            match = match_rst_role(text, newrole)
+            if match:
+                return match
+        return None
+    if role not in ReST_ROLES:
+        raise ValueError('this role is not listed in ReST roles')
+    return re.search(r':' + role + ':`.+`', text)
+
+def match_rst_directive(text, directive=None):
+    """
+    Return first match object of regex search for given ReST directive.
+
+    If no directive is given, all directives in ReST_DIRECTIVES are tested one by one.
+
+    The first one to three lines after the directive statement are also captured.
+    """
+    if not directive:
+        for newdirective in ReST_DIRECTIVES:
+            match = match_rst_directive(text, newdirective)
+            if match:
+                return match
+        return None
+    if directive not in ReST_DIRECTIVES:
+        raise ValueError('this directive is not listed in ReST directives')
+    # Use plain `.*`: a nested `(.+)*` backtracks exponentially on unterminated lines.
+    return re.search(r'^\.\. ' + directive + r'::.*(\n.*){1,3}', text, re.MULTILINE)
+
+def match_rst_header(text, symbol):
+    """
+    Return first match object of regex search for reStructuredText header.
+
+    Python conventions are followed, namely that ``#`` and ``*`` headers have
+    both over and underline (of equal length, so faulty ones are not matched),
+    while the others (``=``, ``-``, ``"`` and ``^``) only have the underline.
+    """
+    if symbol not in ReST_HEADER_REGEX_DICT:
+        raise ValueError('symbol is not a ReST header symbol')
+    return re.search(ReST_HEADER_REGEX_DICT[symbol], text, re.MULTILINE)
+
 
 def detect_markup_language(text):
     """
-    Detect which markup language is being used.
+    Detect whether text is plain text, Markdown or reStructuredText.
 
     This method returns a dictionary containing:
 
-    * language
     * errors
 
-    where ``language`` can be one of: plain, reStructuredText, Markdown
+    Inline and displayed maths are assumed enabled through MathJax.
+    For plain text and Markdown, this assumes the conventions
+    * inline: $ ... $ and \\( ... \\)
+    * displayed: $$ ... $$ and \\[ ... \\]
+
+    while for reStructuredText, the ``math`` role and directive are used.
+
+    We define markers and indicators. A marker is a regex which occurs
+    in only one of the languages. An indicator occurs in more than one,
+    but not all languages.
+
+    Language markers:
+
+    Markdown:
+    * headers: [one or more #] [non-empty text]
+    * blockquotes: one or more lines starting with > [non-empty text]
 
-    The criteria used are:
+    reStructuredText:
+    * use of the :math: role or .. math:: directive
+    * [two or more #][blank space][carriage return]
+      [text on a single line, as long as or shorter than # sequence]
+      [same length of #]
+    * same thing but for * headlines
+    * use of any other role
+    * use of any other directive
 
-    * if the ``math`` role or directive is found together with $...$, return error
-    * if the ``math`` role or directive is found, return ReST
+    Language indicators:
 
-    Assumptions:
+    Plain text or Markdown:
+    * inline or displayed maths
 
-    * MathJax is set up with $...$ for inline, \[...\] for online equations.
+    Markdown or reStructuredText:
+    * [=]+ alone on a line  <- users discouraged to use this in Markdown
+    * [-]+ alone on a line  <- users discouraged to use this in Markdown
+
+    Exclusions (sources of errors):
+    * inline or displayed maths cannot be used in ReST
+
+    The criteria used for error reporting are:
+
+    * if the ``math`` role or directive is found together with inline/displayed maths
     """
+    # Start from the default assumption
+    detector = {
+        'language': 'plain',
+        'errors': None
+    }
+
+    # Inline maths is of the form $ ... $ or \( ... \)
+    inline_math = match_inline_math(text)
+
+    # Displayed maths is of the form \[ ... \] or $$ ... $$
+    displayed_math = match_displayed_math(text)
+
+    rst_math_role = match_rst_role(text, 'math')
+    rst_math_directive = match_rst_directive(text, 'math')
+
+    md_header = match_md_header(text)
+    md_blockquote = match_md_blockquote(text)
+
+    if rst_math_role or rst_math_directive:
+        if inline_math:
+            detector['errors'] = (
+                'You have mixed inline maths ($ ... $ or \\( ... \\) ) with '
+                'reStructuredText markup.\n\nPlease use one or the other, but not both!')
+            return detector
+        elif displayed_math:
+            detector['errors'] = (
+                'You have mixed displayed maths ($$ ... $$ or \\[ ... \\]) with '
+                'reStructuredText markup.\n\nPlease use one or the other, but not both!')
+            return detector
+        elif md_header:
+            detector['errors'] = (
+                'You have mixed Markdown headers with reStructuredText math roles/directives.'
+                '\n\nPlease use one language only.')
+        elif md_blockquote:
+            detector['errors'] = (
+                'You have mixed Markdown blockquotes with reStructuredText math roles/directives.'
+                '\n\nPlease use one language only.')
+        else:
+            detector['language'] = 'reStructuredText'
+
+    return detector
+
+
+
+def detect_markup_language_old(text):
 
     # Inline maths
-    inline_math = re.search("\$[^$]+\$", text)
+    inline_math = match_inline_math(text)
     # if inline_math:
     #     print('inline math: %s' % inline_math.group(0))
 
     # Online maths is of the form \[ ... \]
     # The re.DOTALL is to also capture newline chars with the . (any single character)
-    online_math = re.search(r'[\\][[].+[\\][\]]', text, re.DOTALL)
+    online_math = match_displayed_math(text)
     # if online_math:
     #     print('online math: %s' % online_math.group(0))
 
diff --git a/markup/views.py b/markup/views.py
index 507e1adbd..ea4fea602 100644
--- a/markup/views.py
+++ b/markup/views.py
@@ -2,6 +2,7 @@ __copyright__ = "Copyright © Stichting SciPost (SciPost Foundation)"
 __license__ = "AGPL v3"
 
 
+from django.contrib.auth.decorators import login_required
 from django.http import JsonResponse
 from django.shortcuts import render
 
@@ -10,6 +11,7 @@ from .constants import MathSnippets, PlainTextSnippets,\
 from .forms import MarkupTextForm
 
 
+@login_required
 def process(request):
     """
     API call to process the POSTed text.
-- 
GitLab