From 1bfa5a1bbd91fd518ca725d760e7568b5b6faae6 Mon Sep 17 00:00:00 2001 From: George Katsikas <giorgakis.katsikas@gmail.com> Date: Thu, 30 Nov 2023 18:51:32 +0100 Subject: [PATCH] add crossref metadata XML validation through XSD --- requirements.txt | 1 + scipost_django/SciPost_v1/settings/base.py | 1 + scipost_django/journals/forms.py | 59 ++++++++++++++++++- .../journals/create_metadata_xml.html | 34 ++++++++--- scipost_django/journals/views.py | 8 +++ 5 files changed, 95 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 351d26ebd..c970f6309 100644 --- a/requirements.txt +++ b/requirements.txt @@ -62,6 +62,7 @@ ithenticate-api-python==0.8 python-dateutil==2.8.2 # 2023-02-05 Pillow==9.4.0 # 2023-02-05 html2text +lxml==4.9.3 # 2023-11-29 Markdown==3.1.1 diff --git a/scipost_django/SciPost_v1/settings/base.py b/scipost_django/SciPost_v1/settings/base.py index ec347f304..52855255f 100644 --- a/scipost_django/SciPost_v1/settings/base.py +++ b/scipost_django/SciPost_v1/settings/base.py @@ -435,6 +435,7 @@ CROSSREF_LOGIN_ID = "" CROSSREF_LOGIN_PASSWORD = "" CROSSREF_DEBUG = True CROSSREF_DEPOSIT_EMAIL = "techsupport@scipost.org" +CROSSREF_SCHEMA_FILE = "crossref/schemas/crossref5.3.1.xsd" DOAJ_API_KEY = "" # Google reCaptcha with Google's global test keys diff --git a/scipost_django/journals/forms.py b/scipost_django/journals/forms.py index d0f1f70d8..e30c4e9f2 100644 --- a/scipost_django/journals/forms.py +++ b/scipost_django/journals/forms.py @@ -17,6 +17,9 @@ from django.forms import BaseModelFormSet, modelformset_factory from django.template import loader from django.utils import timezone +import lxml.etree as ET +from html.entities import entitydefs + from crispy_forms.helper import FormHelper from crispy_forms.layout import Layout, Div, Field, ButtonHolder, Submit from crispy_bootstrap5.bootstrap5 import FloatingField @@ -240,14 +243,39 @@ class AuthorsTableOrganizationSelectForm(forms.ModelForm): class 
CreateMetadataXMLForm(forms.ModelForm): + # NOTE(review): the XSD is read from a filesystem path built by plain string concatenation; assumes STATIC_ROOT ends with a path separator - confirm + schema = ET.XMLSchema(file=settings.STATIC_ROOT + settings.CROSSREF_SCHEMA_FILE) + parser = ET.XMLParser(schema=schema) + class Meta: model = Publication fields = ["metadata_xml"] def __init__(self, *args, **kwargs): - kwargs["initial"] = {"metadata_xml": self.new_xml(kwargs.get("instance"))} + xml = self.new_xml(kwargs.get("instance")) + self.xml_str = self.format_xml(self.decode_html_entities(xml)) + kwargs["initial"] = {"metadata_xml": self.xml_str} + super().__init__(*args, **kwargs) + @staticmethod + def decode_html_entities(xml: str): + # Replace named HTML entities with their characters, leaving lt, gt, amp, quot and apos escaped + for entity, symbol in entitydefs.items(): + if entity in ["lt", "gt", "amp", "quot", "apos"]: + continue + + xml = xml.replace(f"&{entity};", symbol) + + return xml + + def clean_metadata_xml(self): + # Flatten the XML before saving: remove every run of newlines together with the whitespace around it + xml = self.cleaned_data["metadata_xml"] + xml = re.sub(r"\s*\n+\s*", "", xml, flags=re.MULTILINE) + + return xml + def save(self, *args, **kwargs): self.instance.latest_metadata_update = timezone.now() return super().save(*args, **kwargs) @@ -281,6 +309,35 @@ class CreateMetadataXMLForm(forms.ModelForm): } return template.render(context) + def format_xml(self, xml_str: str) -> str: + """ + Format XML by pretty printing it. + Returns the formatted XML as a string. + """ + # Try to parse the XML; on any exception (bare except) return the string unchanged + try: + xml = ET.fromstring(bytes(xml_str, encoding="utf8")) + xml_str = ET.tostring(xml, pretty_print=True).decode("utf8") + except: + pass + + return xml_str + + def validate_xml(self, xml_str: str): + """ + Validate XML by running it through the schema. + Returns a tuple of (valid, errors, xml_str). 
+ """ + # Pretty-print, parse and validate against the schema; an XML syntax error is returned as (False, [message], xml_str) + try: + xml_str = self.format_xml(xml_str) + xml = ET.fromstring(bytes(xml_str, encoding="utf8")) + valid = self.schema.validate(xml) + errors = list(self.schema.error_log) + return valid, errors, xml_str + except ET.XMLSyntaxError as error: + return False, [str(error)], xml_str + class CreateMetadataDOAJForm(forms.ModelForm): class Meta: diff --git a/scipost_django/journals/templates/journals/create_metadata_xml.html b/scipost_django/journals/templates/journals/create_metadata_xml.html index 8f4ac0015..dc21fce32 100644 --- a/scipost_django/journals/templates/journals/create_metadata_xml.html +++ b/scipost_django/journals/templates/journals/create_metadata_xml.html @@ -1,6 +1,8 @@ {% extends 'scipost/base.html' %} -{% block pagetitle %}: Create metadata xml{% endblock pagetitle %} +{% block pagetitle %} + : Create metadata xml +{% endblock pagetitle %} {% block breadcrumb %} <div class="breadcrumb-container"> @@ -22,20 +24,38 @@ <div class="col-12"> <h1 class="highlight">Create metadata XML (for Crossref deposit)</h1> <p> - The following field is prefilled with data from the Publication object. Once you accept them, they will overwrite the current metadata, shown below. + The following field is prefilled with data from the Publication object, formatted for readability. Once you accept them, they will be flattened to a single line and overwrite the current metadata shown below. 
</p> - <br> - <form action="{% url 'journals:create_metadata_xml' publication.doi_label %}" method="post"> + <br /> + <form action="{% url 'journals:create_metadata_xml' publication.doi_label %}" + method="post"> {% csrf_token %} {{ form|bootstrap }} - <input type="submit" class="btn btn-primary" value="Accept the metadata"> + + {% if errors %} + <div class="bg-danger bg-opacity-10 p-3 mb-2"> + <span class="fw-bold me-2">XML validation result:</span> + <ul class="mb-0"> + + {% for error in errors %}<li>{{ error }}</li>{% endfor %} + + </ul> + </div> + {% else %} + <div class="bg-success bg-opacity-10 p-3 mb-2"> + <span class="fw-bold me-2">XML validation result:</span> + Valid + </div> + {% endif %} + + <input type="submit" class="btn btn-primary" value="Accept the metadata" /> <a href="{% url 'journals:manage_metadata' %}" class="ms-3 btn btn-link">Back to Admin</a> </form> - <hr class="divider"> + <hr class="divider" /> <h3>Current metadata xml</h3> - <br> + <br /> <pre><code>{{ publication.metadata_xml|linebreaksbr }}</code></pre> </div> </div> diff --git a/scipost_django/journals/views.py b/scipost_django/journals/views.py index d39a12839..c654e1eb7 100644 --- a/scipost_django/journals/views.py +++ b/scipost_django/journals/views.py @@ -9,6 +9,7 @@ import os import random import string import shutil +from typing import Any, Dict import requests import matplotlib @@ -1068,6 +1069,13 @@ class CreateMetadataXMLView( form_class = CreateMetadataXMLForm template_name = "journals/create_metadata_xml.html" + def get_context_data(self, **kwargs: Any) -> Dict[str, Any]: + context = super().get_context_data(**kwargs) + form = context["form"] + valid, errors, xml_str = form.validate_xml(form.xml_str) + + return {**context, "valid": valid, "errors": errors, "xml_str": xml_str} + @permission_required("scipost.can_draft_publication", return_403=True) @transaction.atomic -- GitLab