From 1bfa5a1bbd91fd518ca725d760e7568b5b6faae6 Mon Sep 17 00:00:00 2001
From: George Katsikas <giorgakis.katsikas@gmail.com>
Date: Thu, 30 Nov 2023 18:51:32 +0100
Subject: [PATCH] add crossref metadata XML validation through XSD

---
 requirements.txt                              |  1 +
 scipost_django/SciPost_v1/settings/base.py    |  1 +
 scipost_django/journals/forms.py              | 59 ++++++++++++++++++-
 .../journals/create_metadata_xml.html         | 34 ++++++++---
 scipost_django/journals/views.py              |  8 +++
 5 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 351d26ebd..c970f6309 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,6 +62,7 @@ ithenticate-api-python==0.8
 python-dateutil==2.8.2		# 2023-02-05
 Pillow==9.4.0			# 2023-02-05
 html2text
+lxml==4.9.3             # 2023-11-29 
 
 
 Markdown==3.1.1
diff --git a/scipost_django/SciPost_v1/settings/base.py b/scipost_django/SciPost_v1/settings/base.py
index ec347f304..52855255f 100644
--- a/scipost_django/SciPost_v1/settings/base.py
+++ b/scipost_django/SciPost_v1/settings/base.py
@@ -435,6 +435,7 @@ CROSSREF_LOGIN_ID = ""
 CROSSREF_LOGIN_PASSWORD = ""
 CROSSREF_DEBUG = True
 CROSSREF_DEPOSIT_EMAIL = "techsupport@scipost.org"
+CROSSREF_SCHEMA_FILE = "crossref/schemas/crossref5.3.1.xsd"
 DOAJ_API_KEY = ""
 
 # Google reCaptcha with Google's global test keys
diff --git a/scipost_django/journals/forms.py b/scipost_django/journals/forms.py
index d0f1f70d8..e30c4e9f2 100644
--- a/scipost_django/journals/forms.py
+++ b/scipost_django/journals/forms.py
@@ -17,6 +17,9 @@ from django.forms import BaseModelFormSet, modelformset_factory
 from django.template import loader
 from django.utils import timezone
 
+import lxml.etree as ET
+from html.entities import entitydefs
+
 from crispy_forms.helper import FormHelper
 from crispy_forms.layout import Layout, Div, Field, ButtonHolder, Submit
 from crispy_bootstrap5.bootstrap5 import FloatingField
@@ -240,14 +243,39 @@ class AuthorsTableOrganizationSelectForm(forms.ModelForm):
 
 
 class CreateMetadataXMLForm(forms.ModelForm):
+    # XSD is loaded when the class body runs; the path is a plain concatenation
+    schema = ET.XMLSchema(file=settings.STATIC_ROOT + settings.CROSSREF_SCHEMA_FILE)
+    parser = ET.XMLParser(schema=schema)
+
     class Meta:
         model = Publication
         fields = ["metadata_xml"]
 
     def __init__(self, *args, **kwargs):
-        kwargs["initial"] = {"metadata_xml": self.new_xml(kwargs.get("instance"))}
+        # Prefill: new_xml() output, entity-decoded, pretty-printed when parseable
+        xml = self.new_xml(kwargs.get("instance"))
+        self.xml_str = self.format_xml(self.decode_html_entities(xml))
+        kwargs["initial"] = {"metadata_xml": self.xml_str}
         super().__init__(*args, **kwargs)
 
+    @staticmethod
+    def decode_html_entities(xml: str):
+        """Swap named HTML entities for their characters, except XML's own five."""
+        xml_predefined = ["lt", "gt", "amp", "quot", "apos"]
+        decodable = [
+            pair for pair in entitydefs.items() if pair[0] not in xml_predefined
+        ]
+        for entity, char in decodable:
+            xml = xml.replace(f"&{entity};", char)
+        return xml
+
+    def clean_metadata_xml(self):
+        """Collapse the submitted XML onto a single line before it is saved."""
+        # Every run of newlines goes, along with the whitespace around it
+        newline_runs = re.compile(r"\s*\n+\s*", flags=re.MULTILINE)
+        flattened = newline_runs.sub("", self.cleaned_data["metadata_xml"])
+        return flattened
+
     def save(self, *args, **kwargs):
         self.instance.latest_metadata_update = timezone.now()
         return super().save(*args, **kwargs)
@@ -281,6 +309,35 @@ class CreateMetadataXMLForm(forms.ModelForm):
         }
         return template.render(context)
 
+    def format_xml(self, xml_str: str) -> str:
+        """
+        Pretty-print XML, serialised from the parsed root element.
+        Returns the input unchanged when it cannot be parsed or serialised.
+        """
+        # Best effort: swallow only lxml/encoding/type errors, so that
+        # KeyboardInterrupt, SystemExit and genuine bugs still propagate.
+        try:
+            xml = ET.fromstring(bytes(xml_str, encoding="utf8"))
+            xml_str = ET.tostring(xml, pretty_print=True).decode("utf8")
+        except (ET.LxmlError, ValueError, TypeError):
+            pass
+        return xml_str
+
+    def validate_xml(self, xml_str: str):
+        """Validate XML against the class-level schema.
+        Returns (valid, errors, xml_str); errors holds the schema's log
+        entries, or a single message string when the XML cannot be parsed.
+        """
+        # A syntax error is reported as invalid rather than raised
+        try:
+            xml_str = self.format_xml(xml_str)
+            xml = ET.fromstring(bytes(xml_str, encoding="utf8"))
+            valid = self.schema.validate(xml)
+            errors = list(self.schema.error_log)
+            return valid, errors, xml_str
+        except ET.XMLSyntaxError as error:
+            return False, [str(error)], xml_str
+
 
 class CreateMetadataDOAJForm(forms.ModelForm):
     class Meta:
diff --git a/scipost_django/journals/templates/journals/create_metadata_xml.html b/scipost_django/journals/templates/journals/create_metadata_xml.html
index 8f4ac0015..dc21fce32 100644
--- a/scipost_django/journals/templates/journals/create_metadata_xml.html
+++ b/scipost_django/journals/templates/journals/create_metadata_xml.html
@@ -1,6 +1,8 @@
 {% extends 'scipost/base.html' %}
 
-{% block pagetitle %}: Create metadata xml{% endblock pagetitle %}
+{% block pagetitle %}
+  : Create metadata xml
+{% endblock pagetitle %}
 
 {% block breadcrumb %}
   <div class="breadcrumb-container">
@@ -22,20 +24,38 @@
     <div class="col-12">
       <h1 class="highlight">Create metadata XML (for Crossref deposit)</h1>
       <p>
-        The following field is prefilled with data from the Publication object. Once you accept them, they will overwrite the current metadata, shown below.
+        The following field is prefilled with data from the Publication object, formatted for readability. Once you accept it, it will be flattened to a single line and overwrite the current metadata shown below.
       </p>
-      <br>
-      <form action="{% url 'journals:create_metadata_xml' publication.doi_label %}" method="post">
+      <br />
+      <form action="{% url 'journals:create_metadata_xml' publication.doi_label %}"
+            method="post">
         {% csrf_token %}
         {{ form|bootstrap }}
-        <input type="submit" class="btn btn-primary" value="Accept the metadata">
+
+        {% if errors %}
+          <div class="bg-danger bg-opacity-10 p-3 mb-2">
+            <span class="fw-bold me-2">XML validation result:</span>
+            <ul class="mb-0">
+
+              {% for error in errors %}<li>{{ error }}</li>{% endfor %}
+
+            </ul>
+          </div>
+        {% else %}
+          <div class="bg-success bg-opacity-10 p-3 mb-2">
+            <span class="fw-bold me-2">XML validation result:</span>
+            Valid
+          </div>
+        {% endif %}
+
+        <input type="submit" class="btn btn-primary" value="Accept the metadata" />
         <a href="{% url 'journals:manage_metadata' %}" class="ms-3 btn btn-link">Back to Admin</a>
       </form>
 
-      <hr class="divider">
+      <hr class="divider" />
 
       <h3>Current metadata xml</h3>
-      <br>
+      <br />
       <pre><code>{{ publication.metadata_xml|linebreaksbr }}</code></pre>
     </div>
   </div>
diff --git a/scipost_django/journals/views.py b/scipost_django/journals/views.py
index d39a12839..c654e1eb7 100644
--- a/scipost_django/journals/views.py
+++ b/scipost_django/journals/views.py
@@ -9,6 +9,7 @@ import os
 import random
 import string
 import shutil
+from typing import Any, Dict
 import requests
 
 import matplotlib
@@ -1068,6 +1069,13 @@ class CreateMetadataXMLView(
     form_class = CreateMetadataXMLForm
     template_name = "journals/create_metadata_xml.html"
 
+    def get_context_data(self, **kwargs: Any) -> Dict[str, Any]:
+        """Extend the context with the schema check of the form's prefilled XML."""
+        context = super().get_context_data(**kwargs)
+        form = context["form"]
+        valid, errors, xml_str = form.validate_xml(form.xml_str)
+        return dict(context, valid=valid, errors=errors, xml_str=xml_str)
+
 
 @permission_required("scipost.can_draft_publication", return_403=True)
 @transaction.atomic
-- 
GitLab