From 5f7c6fbfe7f88912badf7a7a6d5895f0753206b8 Mon Sep 17 00:00:00 2001
From: "J.-S. Caux" <J.S.Caux@uva.nl>
Date: Sat, 26 Jun 2021 15:53:47 +0200
Subject: [PATCH] Add basic internal plagiarism checks

---
 scipost_django/submissions/mixins.py          |  2 +-
 .../admin/plagiarism_internal_check.html      | 72 +++++++++++++++++++
 .../submissions/admin/plagiarism_report.html  | 20 +++---
 .../admin/submission_prescreening.html        |  3 +
 scipost_django/submissions/urls.py            |  2 +
 scipost_django/submissions/views.py           | 49 ++++++++++++-
 6 files changed, 136 insertions(+), 12 deletions(-)
 create mode 100644 scipost_django/submissions/templates/submissions/admin/plagiarism_internal_check.html

diff --git a/scipost_django/submissions/mixins.py b/scipost_django/submissions/mixins.py
index ee7a26bd1..d096c3054 100644
--- a/scipost_django/submissions/mixins.py
+++ b/scipost_django/submissions/mixins.py
@@ -68,7 +68,7 @@ class SubmissionFormViewMixin:
 class SubmissionAdminViewMixin(FriendlyPermissionMixin, SubmissionFormViewMixin):
     """
     This mixin will provide all basic methods and checks required for Submission
-    administrational actions regarding Submissions.
+    administration actions regarding Submissions.
 
     :editorial_page: Submission is element of the set pool() if False,
                      else Submission is element of the subset: editorial_page()
diff --git a/scipost_django/submissions/templates/submissions/admin/plagiarism_internal_check.html b/scipost_django/submissions/templates/submissions/admin/plagiarism_internal_check.html
new file mode 100644
index 000000000..699c5bb2c
--- /dev/null
+++ b/scipost_django/submissions/templates/submissions/admin/plagiarism_internal_check.html
@@ -0,0 +1,72 @@
+{% extends 'submissions/admin/base.html' %}
+
+{% block pagetitle %}: plagiarism internal check ({{ submission.preprint.identifier_w_vn_nr }}){% endblock pagetitle %}
+
+{% block breadcrumb_items %}
+  {{ block.super }}
+  <span class="breadcrumb-item"><a href="{% url 'submissions:do_prescreening' submission.preprint.identifier_w_vn_nr %}">Pre-screening {{ submission.preprint.identifier_w_vn_nr }}</a></span>
+  <span class="breadcrumb-item">Plagiarism internal check ({{ submission.preprint.identifier_w_vn_nr }})</span>
+{% endblock %}
+
+{% block content %}
+  <h1>Plagiarism internal check for <a href="{{ submission.get_absolute_url }}">{{ submission.preprint.identifier_w_vn_nr }}</a></h1>
+  <h2>{{ submission.title }}</h2>
+  <h3 class="mb-4">by {{ submission.author_list }}</h3>
+  {# The view supplies similarity ratios between 0 and 1; widthratio rescales them to whole percentages. #}
+  <h1 class="highlight">Submission matches</h1>
+  <table class="table">
+    <thead>
+      <tr>
+        <th>Submission</th>
+        <th>Title&nbsp;match&nbsp;&#37;</th>
+        <th>Authors&nbsp;match&nbsp;&#37;</th>
+        <th>Abstract&nbsp;match&nbsp;&#37;</th>
+      </tr>
+    </thead>
+    <tbody>
+      {% for match in submission_matches %}
+        <tr>
+          <td>
+            {{ match.submission.preprint.identifier_w_vn_nr }}<br>
+            <a href="{{ match.submission.get_absolute_url }}" target="_blank" rel="noopener">{{ match.submission.title }}</a><br>
+            {{ match.submission.author_list }}
+          </td>
+          <td>{% widthratio match.ratio_title 1 100 %}</td>
+          <td>{% widthratio match.ratio_authors 1 100 %}</td>
+          <td>{% widthratio match.ratio_abstract 1 100 %}</td>
+        </tr>
+      {% empty %}
+        <tr><td colspan="4">No matching Submissions</td></tr>
+      {% endfor %}
+    </tbody>
+  </table>
+
+  <h1 class="highlight">Publication matches</h1>
+  <table class="table">
+    <thead>
+      <tr>
+        <th>Publication</th>
+        <th>Title&nbsp;match&nbsp;&#37;</th>
+        <th>Authors&nbsp;match&nbsp;&#37;</th>
+        <th>Abstract&nbsp;match&nbsp;&#37;</th>
+      </tr>
+    </thead>
+    <tbody>
+      {% for match in publication_matches %}
+        <tr>
+          <td>
+            {{ match.publication.doi_label }}<br>
+            <a href="{{ match.publication.get_absolute_url }}" target="_blank" rel="noopener">{{ match.publication.title }}</a><br>
+            {{ match.publication.author_list }}
+          </td>
+          <td>{% widthratio match.ratio_title 1 100 %}</td>
+          <td>{% widthratio match.ratio_authors 1 100 %}</td>
+          <td>{% widthratio match.ratio_abstract 1 100 %}</td>
+        </tr>
+      {% empty %}
+        <tr><td colspan="4">No matching Publications</td></tr>
+      {% endfor %}
+    </tbody>
+  </table>
+
+{% endblock content %}
diff --git a/scipost_django/submissions/templates/submissions/admin/plagiarism_report.html b/scipost_django/submissions/templates/submissions/admin/plagiarism_report.html
index 5486b5247..d8db6c77f 100644
--- a/scipost_django/submissions/templates/submissions/admin/plagiarism_report.html
+++ b/scipost_django/submissions/templates/submissions/admin/plagiarism_report.html
@@ -5,35 +5,35 @@
 {% block pagetitle %}: plagiarism report ({{ submission.preprint.identifier_w_vn_nr }}){% endblock pagetitle %}
 
 {% block breadcrumb_items %}
-  {{block.super}}
+  {{ block.super }}
   <span class="breadcrumb-item">Plagiarism Report ({{ submission.preprint.identifier_w_vn_nr }})</span>
 {% endblock %}
 
 {% block content %}
-  <h1>Plagiarism Report for <a href="{{submission.get_absolute_url}}">{{submission.preprint.identifier_w_vn_nr}}</a></h1>
-  <h2>{{submission.title}}</h2>
-  <h3 class="mb-4">by {{submission.author_list}}</h3>
+  <h1>Plagiarism Report for <a href="{{ submission.get_absolute_url }}">{{ submission.preprint.identifier_w_vn_nr }}</a></h1>
+  <h2>{{ submission.title }}</h2>
+  <h3 class="mb-4">by {{ submission.author_list }}</h3>
   {% if submission.plagiarism_report %}
     <table>
       <tr>
         <td style="min-width: 150px;">iThenticate document</td>
-        <td>{{submission.plagiarism_report.doc_id}}</td>
+        <td>{{ submission.plagiarism_report.doc_id }}</td>
       </tr>
       <tr>
         <td>Percent match</td>
-        <td>{{submission.plagiarism_report.percent_match}}%</td>
+        <td>{{ submission.plagiarism_report.percent_match }}%</td>
       </tr>
       <tr>
         <td>Processed</td>
-        <td>{{submission.plagiarism_report.processed_time}}</td>
+        <td>{{ submission.plagiarism_report.processed_time }}</td>
       </tr>
       <tr>
         <td>Uploaded</td>
-        <td>{{submission.plagiarism_report.uploaded_time}}</td>
+        <td>{{ submission.plagiarism_report.uploaded_time }}</td>
       </tr>
       <tr>
         <td>Latest update</td>
-        <td>{{submission.plagiarism_report.latest_activity}}</td>
+        <td>{{ submission.plagiarism_report.latest_activity }}</td>
       </tr>
     </table>
   {% else %}
@@ -42,7 +42,7 @@
 
   <form method="post" class="mt-3" enctype="multipart/form-data">
     {% csrf_token %}
-    {{form|bootstrap}}
+    {{ form|bootstrap }}
     <input type="submit" class="btn btn-primary" value="{% if submission.plagiarism_report %}Update report status{% else %}Submit submission for plagiarism check{% endif %}">
     {% if submission.plagiarism_report %}
       <a href="{% url 'submissions:plagiarism_report' submission.preprint.identifier_w_vn_nr %}" class="ms-2 btn btn-default">Download report pdf</a>
diff --git a/scipost_django/submissions/templates/submissions/admin/submission_prescreening.html b/scipost_django/submissions/templates/submissions/admin/submission_prescreening.html
index b88cbddee..1eb5a2398 100644
--- a/scipost_django/submissions/templates/submissions/admin/submission_prescreening.html
+++ b/scipost_django/submissions/templates/submissions/admin/submission_prescreening.html
@@ -29,6 +29,9 @@
       <span class="text-success">{% include 'bi/check-square-fill.html' %}</span>
       <a href="{% url 'colleges:submission' submission.preprint.identifier_w_vn_nr %}">Manage this submission's Fellowship ({{ submission.fellows.count }} fellows)</a><br>
     </li>
+    <li>
+      <a href="{% url 'submissions:plagiarism_internal' submission.preprint.identifier_w_vn_nr %}">Check internal plagiarism</a>
+    </li>
     <li>
       {% if submission.plagiarism_report %}
         <span class="text-success">{% include 'bi/check-square-fill.html' %}</span>
diff --git a/scipost_django/submissions/urls.py b/scipost_django/submissions/urls.py
index 9f477960e..2208e0380 100644
--- a/scipost_django/submissions/urls.py
+++ b/scipost_django/submissions/urls.py
@@ -75,6 +75,8 @@ urlpatterns = [
         views.PlagiarismView.as_view(), name='plagiarism'),
     url(r'^admin/{regex}/plagiarism/report$'.format(regex=SUBMISSIONS_COMPLETE_REGEX),
         views.PlagiarismReportPDFView.as_view(), name='plagiarism_report'),
+    url(r'^admin/{regex}/plagiarism/internal$'.format(regex=SUBMISSIONS_COMPLETE_REGEX),
+        views.PlagiarismInternalView.as_view(), name='plagiarism_internal'),
     url(
         r'^admin/{regex}/recommendation$'.format(
         regex=SUBMISSIONS_COMPLETE_REGEX), views.EICRecommendationDetailView.as_view(),
diff --git a/scipost_django/submissions/views.py b/scipost_django/submissions/views.py
index 566b24432..358e3ce15 100644
--- a/scipost_django/submissions/views.py
+++ b/scipost_django/submissions/views.py
@@ -3,6 +3,7 @@ __license__ = "AGPL v3"
 
 
 import datetime
+from difflib import SequenceMatcher
 import feedparser
 import strings
 
@@ -63,7 +64,7 @@ from common.helpers import get_new_secrets_key
 from common.utils import workdays_between
 from invitations.constants import STATUS_SENT
 from invitations.models import RegistrationInvitation
-from journals.models import Journal
+from journals.models import Journal, Publication
 from mails.utils import DirectMailUtil
 from mails.views import MailEditorSubview
 from ontology.models import Topic
@@ -2335,6 +2336,52 @@ class PlagiarismReportPDFView(SubmissionAdminViewMixin, SingleObjectMixin, Redir
         return url
 
 
+class PlagiarismInternalView(SubmissionAdminViewMixin, DetailView):
+    """
+    List Submissions and Publications whose title or abstract resembles this one.
+
+    Similarity is the `difflib.SequenceMatcher` ratio (0 to 1). A candidate is
+    listed if its title ratio is >= `match_threshold` or its abstract ratio is
+    > `match_threshold`; the author ratio is reported but never used to select.
+    """
+    permission_required = 'scipost.can_run_pre_screening'
+    template_name = 'submissions/admin/plagiarism_internal_check.html'
+    editorial_page = True
+    success_url = reverse_lazy('submissions:pool')
+    match_threshold = 0.8
+
+    def _find_matches(self, submission, candidates, key):
+        """Return one dict (candidate under `key`, plus ratios) per matching candidate."""
+        matches = []
+        for candidate in candidates:
+            ratio_title = SequenceMatcher(None, submission.title, candidate.title).ratio()
+            ratio_authors = SequenceMatcher(
+                None, submission.author_list, candidate.author_list).ratio()
+            ratio_abstract = SequenceMatcher(
+                None, submission.abstract, candidate.abstract).ratio()
+            # NOTE(review): title uses >= while abstract uses >; kept as found, confirm intent.
+            if ratio_title >= self.match_threshold or ratio_abstract > self.match_threshold:
+                matches.append({
+                    key: candidate,
+                    'ratio_title': ratio_title,
+                    'ratio_authors': ratio_authors,
+                    'ratio_abstract': ratio_abstract,
+                })
+        return matches
+
+    def get_context_data(self, *args, **kwargs):
+        """Add `submission_matches` and `publication_matches` to the template context."""
+        context = super().get_context_data(*args, **kwargs)
+        # DetailView.get() has already stored the object; no need to fetch it again.
+        submission = self.object
+        # Compare against every other Submission, then against every Publication.
+        context['submission_matches'] = self._find_matches(
+            submission, Submission.objects.exclude(pk=submission.id), 'submission')
+        context['publication_matches'] = self._find_matches(
+            submission, Publication.objects.all(), 'publication')
+        return context
+
+
 ##############
 # Monitoring #
 ##############
-- 
GitLab