SciPost Code Repository

Skip to content
Snippets Groups Projects
Commit 6a1952e4 authored by Jean-Sébastien Caux
Browse files

Improve markup language detection

parent 80ab9b52
No related branches found
No related tags found
No related merge requests found
...@@ -106,12 +106,48 @@ def match_rst_header(text, symbol=None): ...@@ -106,12 +106,48 @@ def match_rst_header(text, symbol=None):
return None return None
if symbol not in ReST_HEADER_REGEX_DICT.keys(): if symbol not in ReST_HEADER_REGEX_DICT.keys():
raise ValueError('symbol is not a ReST header symbol') raise ValueError('symbol is not a ReST header symbol')
print('Looking for %s in rst: %s' % (
symbol,
re.search(ReST_HEADER_REGEX_DICT[symbol], text, re.MULTILINE)))
return re.search(ReST_HEADER_REGEX_DICT[symbol], text, re.MULTILINE) return re.search(ReST_HEADER_REGEX_DICT[symbol], text, re.MULTILINE)
def _last_marker(found):
    """
    Return the most recently inserted ``(name, match)`` pair of ``found``.

    This yields the same item ``dict.popitem()`` would, but leaves the
    dictionary untouched.
    """
    return list(found.items())[-1]


def check_markers(markers):
    """
    Checks the consistency of a markers dictionary. Returns a detector.

    ``markers`` is a dictionary with the keys ``'plain_or_md'``, ``'md'`` and
    ``'rst'``, each mapping to a dictionary of the markers found for that
    category (marker name -> match).

    The returned detector is a dictionary with two keys:

    * ``'language'``: ``'plain'``, ``'Markdown'`` or ``'reStructuredText'``
    * ``'errors'``: ``None``, or a message describing the inconsistency

    Rules:

    * reStructuredText markers mixed with Markdown (or plain/Markdown) markers
      are an inconsistency: the language falls back to ``'plain'`` and one
      example marker of each kind is reported in ``'errors'``
    * reStructuredText markers alone give ``'reStructuredText'``
    * Markdown markers (without reStructuredText ones) give ``'Markdown'``
    * otherwise the text is ``'plain'``

    The ``markers`` dictionary is not modified.
    """
    if markers['rst']:
        if markers['md']:
            return {
                'language': 'plain',
                'errors': ('Inconsistency: Markdown and reStructuredText syntaxes are mixed:\n\n'
                           'Markdown: %s\n\nreStructuredText: %s' % (
                               # Peek instead of popitem(): reporting an error
                               # must not strip entries from the caller's dict.
                               _last_marker(markers['md']),
                               _last_marker(markers['rst'])))
            }
        if markers['plain_or_md']:
            return {
                'language': 'plain',
                'errors': ('Inconsistency: plain/Markdown and reStructuredText '
                           'syntaxes are mixed:\n\n'
                           'Markdown: %s\n\nreStructuredText: %s' % (
                               _last_marker(markers['plain_or_md']),
                               _last_marker(markers['rst'])))
            }
        return {
            'language': 'reStructuredText',
            'errors': None,
        }
    if markers['md']:
        return {
            'language': 'Markdown',
            'errors': None,
        }
    return {
        'language': 'plain',
        'errors': None,
    }
def detect_markup_language(text): def detect_markup_language(text):
""" """
Detect whether text is plain text, Markdown or reStructuredText. Detect whether text is plain text, Markdown or reStructuredText.
...@@ -163,68 +199,48 @@ def detect_markup_language(text): ...@@ -163,68 +199,48 @@ def detect_markup_language(text):
* if the ``math`` role or directive is found together with inline/displayed maths * if the ``math`` role or directive is found together with inline/displayed maths
""" """
# Start from the default assumption markers = {
detector = { 'plain_or_md': {},
'language': 'plain', 'md': {},
'errors': None 'rst': {},
} }
# Step 1: check maths # Step 1: check maths
# Inline maths is of the form $ ... $ or \( ... \) # Inline maths is of the form $ ... $ or \( ... \)
inline_math = match_inline_math(text) match = match_inline_math(text)
if match:
markers['plain_or_md']['inline_math'] = match
# Displayed maths is of the form \[ ... \] or $$ ... $$ # Displayed maths is of the form \[ ... \] or $$ ... $$
displayed_math = match_displayed_math(text) match = match_displayed_math(text)
if match:
rst_math_role = match_rst_role(text, 'math') markers['plain_or_md']['displayed_math'] = match
rst_math_directive = match_rst_directive(text, 'math')
if rst_math_role or rst_math_directive:
# reStructuredText presumed; check for errors
if inline_math:
detector['errors'] = (
'You have mixed inline maths ($ ... $ or \( ... \) ) with '
'reStructuredText markup.\n\nPlease use one or the other, but not both!')
return detector
elif displayed_math:
detector['errors'] = (
'You have mixed displayed maths ($$ ... $$ or \[ ... \]) with '
'reStructuredText markup.\n\nPlease use one or the other, but not both!')
return detector
else:
detector['language'] = 'reStructuredText'
return detector
# no rst math from here onwards match = match_rst_role(text, 'math')
if match:
markers['rst']['math_role'] = match
match = match_rst_directive(text, 'math')
if match:
markers['rst']['math_directive'] = match
# Step 2: check headers and blockquotes # Step 2: check headers and blockquotes
md_header = match_md_header(text) match = match_md_header(text)
print('md_header: %s' % md_header) if match:
md_blockquote = match_md_blockquote(text) markers['md']['header'] = match
match = match_md_blockquote(text)
rst_header = match_rst_header(text) if match:
print('rst_header: %s' % rst_header) markers['md']['blockquote'] = match
if md_header or md_blockquote:
if rst_math_role or rst_math_directive:
if md_header:
detector['errors'] = (
'You have mixed Markdown headers with reStructuredText math '
'roles/directives.\n\nPlease use one language only.')
elif md_blockquote:
detector['errors'] = (
'You have mixed Markdown blockquotes with reStructuredText math '
'roles/directives.\n\nPlease use one language only.')
detector['language'] = 'Markdown'
elif md_header or md_blockquote: match = match_rst_header(text)
detector['language'] = 'Markdown' if match:
markers['rst']['header'] = match
elif rst_header: print('markers: \n%s' % markers)
detector['language'] = 'reStructuredText'
detector = check_markers(markers)
print('detector: \n%s' % detector)
return detector return detector
...@@ -397,6 +413,10 @@ def process_markup(text, language_forced=None): ...@@ -397,6 +413,10 @@ def process_markup(text, language_forced=None):
language = language_forced if language_forced else markup_detector['language'] language = language_forced if language_forced else markup_detector['language']
markup['language'] = language markup['language'] = language
markup['errors'] = markup_detector['errors']
if markup['errors']:
return markup
if language == 'reStructuredText': if language == 'reStructuredText':
warnStream = StringIO() warnStream = StringIO()
......
...@@ -22,6 +22,7 @@ def process(request): ...@@ -22,6 +22,7 @@ def process(request):
""" """
form = MarkupTextForm(request.POST or None) form = MarkupTextForm(request.POST or None)
if form.is_valid(): if form.is_valid():
print('response: \n%s' % form.get_processed_markup())
return JsonResponse(form.get_processed_markup()) return JsonResponse(form.get_processed_markup())
return JsonResponse({}) return JsonResponse({})
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment