From 6a1952e44cef4e3ec40d3b4509f20d8844cfb1f2 Mon Sep 17 00:00:00 2001 From: "J.-S. Caux" <J.S.Caux@uva.nl> Date: Thu, 13 Jun 2019 09:19:50 +0200 Subject: [PATCH] Improve markup language detection --- markup/utils.py | 122 ++++++++++++++++++++++++++++-------------------- markup/views.py | 1 + 2 files changed, 72 insertions(+), 51 deletions(-) diff --git a/markup/utils.py b/markup/utils.py index 3e0487510..d19851c0e 100644 --- a/markup/utils.py +++ b/markup/utils.py @@ -106,12 +106,48 @@ def match_rst_header(text, symbol=None): return None if symbol not in ReST_HEADER_REGEX_DICT.keys(): raise ValueError('symbol is not a ReST header symbol') - print('Looking for %s in rst: %s' % ( - symbol, - re.search(ReST_HEADER_REGEX_DICT[symbol], text, re.MULTILINE))) return re.search(ReST_HEADER_REGEX_DICT[symbol], text, re.MULTILINE) +def check_markers(markers): + """ + Checks the consistency of a markers dictionary. Returns a detector. + """ + if len(markers['rst']) > 0: + if len(markers['md']) > 0: + return { + 'language': 'plain', + 'errors': ('Inconsistency: Markdown and reStructuredText syntaxes are mixed:\n\n' + 'Markdown: %s\n\nreStructuredText: %s' % ( + markers['md'].popitem(), + markers['rst'].popitem())) + } + if len(markers['plain_or_md']) > 0: + return { + 'language': 'plain', + 'errors': ('Inconsistency: plain/Markdown and reStructuredText ' + 'syntaxes are mixed:\n\n' + 'Markdown: %s\n\nreStructuredText: %s' % ( + markers['plain_or_md'].popitem(), + markers['rst'].popitem())) + } + return { + 'language': 'reStructuredText', + 'errors': None, + } + + elif len(markers['md']) > 0: + return { + 'language': 'Markdown', + 'errors': None, + } + + return { + 'language': 'plain', + 'errors': None, + } + + def detect_markup_language(text): """ Detect whether text is plain text, Markdown or reStructuredText. @@ -163,68 +199,48 @@ def detect_markup_language(text): * if the ``math`` role or directive is found together with inline/displayed maths """ - # Start from the default assumption - detector = { - 'language': 'plain', - 'errors': None + markers = { + 'plain_or_md': {}, + 'md': {}, + 'rst': {}, } # Step 1: check maths # Inline maths is of the form $ ... $ or \( ... \) - inline_math = match_inline_math(text) + match = match_inline_math(text) + if match: + markers['plain_or_md']['inline_math'] = match # Displayed maths is of the form \[ ... \] or $$ ... $$ - displayed_math = match_displayed_math(text) - - rst_math_role = match_rst_role(text, 'math') - rst_math_directive = match_rst_directive(text, 'math') - - if rst_math_role or rst_math_directive: - # reStructuredText presumed; check for errors - if inline_math: - detector['errors'] = ( - 'You have mixed inline maths ($ ... $ or \( ... \) ) with ' - 'reStructuredText markup.\n\nPlease use one or the other, but not both!') - return detector - elif displayed_math: - detector['errors'] = ( - 'You have mixed displayed maths ($$ ... $$ or \[ ... \]) with ' - 'reStructuredText markup.\n\nPlease use one or the other, but not both!') - return detector - else: - detector['language'] = 'reStructuredText' - return detector + match = match_displayed_math(text) + if match: + markers['plain_or_md']['displayed_math'] = match - # no rst math from here onwards + match = match_rst_role(text, 'math') + if match: + markers['rst']['math_role'] = match + match = match_rst_directive(text, 'math') + if match: + markers['rst']['math_directive'] = match # Step 2: check headers and blockquotes - md_header = match_md_header(text) - print('md_header: %s' % md_header) - md_blockquote = match_md_blockquote(text) - - rst_header = match_rst_header(text) - print('rst_header: %s' % rst_header) - - if md_header or md_blockquote: - if rst_math_role or rst_math_directive: - if md_header: - detector['errors'] = ( - 'You have mixed Markdown headers with reStructuredText math ' - 'roles/directives.\n\nPlease use one language only.') - elif md_blockquote: - detector['errors'] = ( - 'You have mixed Markdown blockquotes with reStructuredText math ' - 'roles/directives.\n\nPlease use one language only.') - detector['language'] = 'Markdown' + match = match_md_header(text) + if match: + markers['md']['header'] = match + match = match_md_blockquote(text) + if match: + markers['md']['blockquote'] = match - elif md_header or md_blockquote: - detector['language'] = 'Markdown' + match = match_rst_header(text) + if match: + markers['rst']['header'] = match - elif rst_header: - detector['language'] = 'reStructuredText' + print('markers: \n%s' % markers) + detector = check_markers(markers) + print('detector: \n%s' % detector) return detector @@ -397,6 +413,10 @@ def process_markup(text, language_forced=None): language = language_forced if language_forced else markup_detector['language'] markup['language'] = language + markup['errors'] = markup_detector['errors'] + + if markup['errors']: + return markup if language == 'reStructuredText': warnStream = StringIO() diff --git a/markup/views.py b/markup/views.py index b5e254e44..b6f1f5d67 100644 --- a/markup/views.py +++ b/markup/views.py @@ -22,6 +22,7 @@ def process(request): """ form = MarkupTextForm(request.POST or None) if form.is_valid(): + print('response: \n%s' % form.get_processed_markup()) return JsonResponse(form.get_processed_markup()) return JsonResponse({}) -- GitLab