diff --git a/scipost_django/submissions/admin.py b/scipost_django/submissions/admin.py
index 5895f5bfc5cd85dd3637ef48f202d0fea9bd6284..70358f049989e07b578852e97c639319ce81f2ff 100644
--- a/scipost_django/submissions/admin.py
+++ b/scipost_django/submissions/admin.py
@@ -112,18 +112,15 @@ class SubmissionAdmin(GuardedModelAdmin):
     fieldsets = (
         (
             None,
-            {
-                "fields": ("preprint", "title", "abstract"),
-            },
-        ),
-        (
-            "Versioning",
             {
                 "fields": (
-                    "thread_hash",
-                    "is_current",
-                    "is_resubmission_of",
-                    "list_of_changes",
+                    "preprint",
+                    "title",
+                    "abstract",
+                    "author_list",
+                    "submission_date",
+                    "submitted_to",
+                    "status",
                 ),
             },
         ),
@@ -148,20 +145,44 @@ class SubmissionAdmin(GuardedModelAdmin):
                 "classes": ("collapse",),
                 "fields": (
                     "submitted_by",
-                    "author_list",
                     "authors",
                     "authors_claims",
                     "authors_false_claims",
                 ),
             },
         ),
+        (
+            "Versioning",
+            {
+                "classes": ("collapse",),
+                "fields": (
+                    "thread_hash",
+                    "is_current",
+                    "is_resubmission_of",
+                    "list_of_changes",
+                ),
+            },
+        ),
+        (
+            "Plagiarism",
+            {
+                "classes": ("collapse",),
+                "fields": ("internal_plagiarism_matches", "plagiarism_report"),
+            }
+        ),
+        (
+            "Conflicts of interest",
+            {
+                "classes": ("collapse",),
+                "fields": ("needs_conflicts_update",),
+            }
+        ),
         (
             "Refereeing",
             {
                 "classes": ("collapse",),
                 "fields": (
                     "editor_in_charge",
-                    "status",
                     ("visible_public", "visible_pool"),
                     "refereeing_cycle",
                     ("open_for_commenting", "open_for_reporting"),
@@ -170,9 +191,7 @@ class SubmissionAdmin(GuardedModelAdmin):
                     "referees_flagged",
                     "referees_suggested",
                     "remarks_for_editors",
-                    "submitted_to",
                     "pdf_refereeing_pack",
-                    "plagiarism_report",
                     "fellows",
                 ),
             },
@@ -181,7 +200,7 @@ class SubmissionAdmin(GuardedModelAdmin):
             "Meta",
             {
                 "classes": ("collapse",),
-                "fields": ("metadata", "submission_date", "needs_conflicts_update"),
+                "fields": ("metadata",),
             },
         ),
     )
diff --git a/scipost_django/submissions/api/serializers/submission.py b/scipost_django/submissions/api/serializers/submission.py
index ccbfc8028d870198383cf917639178b31bac346f..a22771e2d1ab3bc395e498808c28374eb1b3c8ef 100644
--- a/scipost_django/submissions/api/serializers/submission.py
+++ b/scipost_django/submissions/api/serializers/submission.py
@@ -84,6 +84,7 @@ class SubmissionPublicSerializer(serializers.ModelSerializer):
 class SubmissionPublicSearchSerializer(serializers.ModelSerializer):
     identifier = serializers.CharField(source="preprint.identifier_w_vn_nr")
     submission_date = serializers.CharField(source="submission_date_ymd")
+    original_submission_date = serializers.CharField(source="original_submission_date_ymd")
     url = serializers.URLField(source="get_absolute_url")
 
     class Meta:
diff --git a/scipost_django/submissions/migrations/0114_submission_internal_plagiarism_matches.py b/scipost_django/submissions/migrations/0114_submission_internal_plagiarism_matches.py
new file mode 100644
index 0000000000000000000000000000000000000000..f87bdeaf80510204c88eff284c5e7a3fb7b68bfa
--- /dev/null
+++ b/scipost_django/submissions/migrations/0114_submission_internal_plagiarism_matches.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.14 on 2022-11-09 07:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('submissions', '0113_alter_refereeinvitation_options'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='submission',
+            name='internal_plagiarism_matches',
+            field=models.JSONField(blank=True, default=dict, null=True),
+        ),
+    ]
diff --git a/scipost_django/submissions/models/submission.py b/scipost_django/submissions/models/submission.py
index 131f0c9be671762b08eb64aebf810dc71760d37b..ad344779de188e9bff4f2635cadd35a7b89b6743 100644
--- a/scipost_django/submissions/models/submission.py
+++ b/scipost_django/submissions/models/submission.py
@@ -167,6 +167,11 @@ class Submission(models.Model):
         blank=True,
         related_name="to_submission",
     )
+    internal_plagiarism_matches = models.JSONField(
+        default=dict,
+        blank=True,
+        null=True,
+    )
 
     pdf_refereeing_pack = models.FileField(
         upload_to="UPLOADS/REFEREE/%Y/%m/", max_length=200, blank=True
@@ -341,6 +346,11 @@ class Submission(models.Model):
         """Return the submission date in YYYY-MM-DD format."""
         return self.submission_date.date()
 
+    @property
+    def original_submission_date_ymd(self):
+        """Return the date of the first Submission in the thread (YYYY-MM-DD)."""
+        return self.original_submission_date.date()
+
     @property
     def original_submission_date(self):
         """Return the submission_date of the first Submission in the thread."""
diff --git a/scipost_django/submissions/tasks.py b/scipost_django/submissions/tasks.py
index 35fc0c98cd9b1f5df8bfad223157922e82980551..be9f6c40db43c931108124ed2656e757742d11f2 100644
--- a/scipost_django/submissions/tasks.py
+++ b/scipost_django/submissions/tasks.py
@@ -2,16 +2,22 @@ __copyright__ = "Copyright © Stichting SciPost (SciPost Foundation)"
 __license__ = "AGPL v3"
 
 from datetime import timedelta
+from difflib import SequenceMatcher
+
 from django.utils import timezone
 
 from SciPost_v1.celery import app
 
 from .models import Submission, EditorialAssignment, RefereeInvitation, Report
 
+from journals.models import Publication
+
 
 @app.task(bind=True)
 def send_editorial_assignment_invitations(self):
-    """Send invitation email to 'next EditorialAssignment' in queue."""
+    """
+    Send next queued editorial assignment invitation emails.
+    """
     qs = Submission.objects.unassigned().has_editor_invitations_to_be_sent().distinct()
     submission_ids = qs.values_list("id", flat=True)
     submissions_count = len(submission_ids)
@@ -49,3 +55,93 @@ def submit_submission_document_for_plagiarism(self):
     for submission in submission_to_update:
         report = submission.plagiarism_report
         # do it...
+
+
+@app.task(bind=True)
+def check_for_internal_plagiarism_submission_matches(self, ratio_threshold=0.7):
+    """
+    Check incoming Submissions for internal plagiarism with preexisting Submissions.
+
+    Each run processes at most five Submissions lacking a "submission_matches"
+    entry. Title, author list and abstract are compared with every earlier
+    Submission outside the thread; a match is stored when the title or
+    abstract similarity ratio exceeds ``ratio_threshold``.
+    """
+
+    submissions_to_check = Submission.objects.exclude(
+        internal_plagiarism_matches__has_key="submission_matches"
+    )
+
+    # Check in small batches to not overwhelm the server
+    for sub_to_check in submissions_to_check.all()[:5]:
+        submission_matches = []
+        # check all Submissions which predate, and are not in the thread of the sub
+        for sub in Submission.objects.exclude(
+            thread_hash=sub_to_check.thread_hash
+        ).filter(submission_date__lt=sub_to_check.submission_date):
+            sub_title_sm = SequenceMatcher(None, sub_to_check.title, sub.title)
+            ratio_title = sub_title_sm.ratio()
+            ratio_authors = SequenceMatcher(
+                None, sub_to_check.author_list, sub.author_list
+            ).ratio()
+            ratio_abstract = SequenceMatcher(
+                None, sub_to_check.abstract, sub.abstract
+            ).ratio()
+            if ratio_title > ratio_threshold or ratio_abstract > ratio_threshold:
+                submission_matches.append(
+                    {
+                        "identifier_w_vn_nr": sub.preprint.identifier_w_vn_nr,
+                        "ratio_title": ratio_title,
+                        "ratio_authors": ratio_authors,
+                        "ratio_abstract": ratio_abstract,
+                    }
+                )
+        # The JSONField is nullable: start from an empty dict instead of
+        # failing on item assignment when the stored value is None.
+        stored_matches = sub_to_check.internal_plagiarism_matches or {}
+        stored_matches["submission_matches"] = submission_matches
+        sub_to_check.internal_plagiarism_matches = stored_matches
+        sub_to_check.save()
+
+
+@app.task(bind=True)
+def check_for_internal_plagiarism_publication_matches(self, ratio_threshold=0.7):
+    """
+    Check incoming Submissions for internal plagiarism with existing Publications.
+
+    Each run processes at most five Submissions lacking a "publication_matches"
+    entry. Title, author list and abstract are compared with every Publication
+    dated before the Submission; a match is stored when the title or abstract
+    similarity ratio exceeds ``ratio_threshold``.
+    """
+
+    submissions_to_check = Submission.objects.exclude(
+        internal_plagiarism_matches__has_key="publication_matches"
+    )
+
+    # Check in small batches to not overwhelm the server
+    for sub_to_check in submissions_to_check.all()[:5]:
+        publication_matches = []
+        for pub in Publication.objects.filter(publication_date__lt=sub_to_check.submission_date):
+            ratio_title = SequenceMatcher(None, sub_to_check.title, pub.title).ratio()
+            ratio_authors = SequenceMatcher(
+                None, sub_to_check.author_list, pub.author_list
+            ).ratio()
+            ratio_abstract = SequenceMatcher(
+                None, sub_to_check.abstract, pub.abstract
+            ).ratio()
+            if ratio_title > ratio_threshold or ratio_abstract > ratio_threshold:
+                publication_matches.append(
+                    {
+                        "doi_label": pub.doi_label,
+                        "ratio_title": ratio_title,
+                        "ratio_authors": ratio_authors,
+                        "ratio_abstract": ratio_abstract,
+                    }
+                )
+        # The JSONField is nullable: start from an empty dict instead of
+        # failing on item assignment when the stored value is None.
+        stored_matches = sub_to_check.internal_plagiarism_matches or {}
+        stored_matches["publication_matches"] = publication_matches
+        sub_to_check.internal_plagiarism_matches = stored_matches
+        sub_to_check.save()
diff --git a/scipost_django/submissions/templates/submissions/admin/plagiarism_internal_check.html b/scipost_django/submissions/templates/submissions/admin/plagiarism_internal_check.html
index b73c9ef8176bd9cb29bd3e762fadbf154d0ea806..39e0f0ea5ea93df95eb6ddc78028d6e70565cafd 100644
--- a/scipost_django/submissions/templates/submissions/admin/plagiarism_internal_check.html
+++ b/scipost_django/submissions/templates/submissions/admin/plagiarism_internal_check.html
@@ -15,6 +15,9 @@
   <h3 class="mb-4">by {{ submission.author_list }}</h3>
 
   <h1 class="highlight">Submission matches</h1>
+  {% if not "submission_matches" in submission.internal_plagiarism_matches %}
+    <h2 class="text-danger border border-danger m-2 p-2">This automated internal plagiarism check has not finished running yet; please come back later!</h2>
+  {% endif %}
   <table class="table">
     <thead>
       <tr>
@@ -28,7 +31,9 @@
       {% for match in submission_matches %}
         <tr>
           <td>
-            {{ match.submission.preprint.identifier_w_vn_nr }}<br>
+            {{ match.submission.preprint.identifier_w_vn_nr }}
+             <small class="text-muted">Thread hash: {{ match.submission.thread_hash }}</small>
+            <br>
             <a href="{{ match.submission.get_absolute_url }}" target="_blank">{{ match.submission.title }}</a><br>
             {{ match.submission.author_list }}
           </td>
@@ -43,6 +48,9 @@
   </table>
 
   <h1 class="highlight">Publication matches</h1>
+  {% if not "publication_matches" in submission.internal_plagiarism_matches %}
+    <h2 class="text-danger border border-danger m-2 p-2">This automated internal plagiarism check has not finished running yet; please come back later!</h2>
+  {% endif %}
   <table class="table">
     <thead>
       <tr>
diff --git a/scipost_django/submissions/templates/submissions/admin/plagiarism_report.html b/scipost_django/submissions/templates/submissions/admin/plagiarism_report.html
index d8db6c77fe6fe1c3748283d863a9e0b9d89bd689..18ec45c65a1406abb5a68c3fea907553a681caf1 100644
--- a/scipost_django/submissions/templates/submissions/admin/plagiarism_report.html
+++ b/scipost_django/submissions/templates/submissions/admin/plagiarism_report.html
@@ -10,7 +10,7 @@
 {% endblock %}
 
 {% block content %}
-  <h1>Plagiarism Report for <a href="{{ submission.get_absolute_url }}">{{ submission.preprint.identifier_w_vn_nr }}</a></h1>
+  <h1>iThenticate Plagiarism Report for <a href="{{ submission.get_absolute_url }}">{{ submission.preprint.identifier_w_vn_nr }}</a></h1>
   <h2>{{ submission.title }}</h2>
   <h3 class="mb-4">by {{ submission.author_list }}</h3>
   {% if submission.plagiarism_report %}
diff --git a/scipost_django/submissions/templates/submissions/pool/_submission_info_table.html b/scipost_django/submissions/templates/submissions/pool/_submission_info_table.html
index e5f8016c002155f88783554df995d8c2e475a9c4..c96ab3ff7982b0b33a91223833ab6819708dd6ce 100644
--- a/scipost_django/submissions/templates/submissions/pool/_submission_info_table.html
+++ b/scipost_django/submissions/templates/submissions/pool/_submission_info_table.html
@@ -99,15 +99,28 @@
 
     {% if perms.scipost.can_do_plagiarism_checks %}
       <tr>
-        <td>Plagiarism score</td>
+        <td>iThenticate plagiarism score</td>
         <td>
           {% if submission.plagiarism_report %}
             {{ submission.plagiarism_report.score }}%
+             <a href="{% url 'submissions:plagiarism_report' submission.preprint.identifier_w_vn_nr %}" class="ms-2 btn btn-default" target="_blank">View report pdf</a>
           {% else %}
             <a href="{% url 'submissions:plagiarism' submission.preprint.identifier_w_vn_nr %}">Run plagiarism check</a>
           {% endif %}
         </td>
       </tr>
+      <tr>
+        <td>Internal plagiarism matches</td>
+        <td>
+          {% if submission.internal_plagiarism_matches.submission_matches %}
+            Sub: {{ submission.internal_plagiarism_matches.submission_matches|length }}  
+          {% endif %}
+          {% if submission.internal_plagiarism_matches.publication_matches %}
+            Pub: {{ submission.internal_plagiarism_matches.publication_matches|length }}  
+          {% endif %}
+          <a href="{% url 'submissions:plagiarism_internal' submission.preprint.identifier_w_vn_nr %}" target="_blank">View details</a>
+        </td>
+      </tr>
     {% endif %}
 
     {% if perms.scipost.can_manage_college_composition %}
diff --git a/scipost_django/submissions/views.py b/scipost_django/submissions/views.py
index 503c82d33d1a231f8a0cd6863085448085ec9222..482c4bb101a3c94d923265cc18e4418e84c8f1db 100644
--- a/scipost_django/submissions/views.py
+++ b/scipost_django/submissions/views.py
@@ -2977,48 +2977,35 @@ class PlagiarismInternalView(SubmissionAdminViewMixin, DetailView):
     def get_context_data(self, *args, **kwargs):
+        """Add the stored internal plagiarism matches, resolved to model instances."""
         context = super().get_context_data(*args, **kwargs)
         submission = self.get_object()
-        ratio_threshold = 0.7
-
-        # Submissions:
-        submission_matches = []
-        for sub in Submission.objects.exclude(pk=submission.id):
-            sub_title_sm = SequenceMatcher(None, submission.title, sub.title)
-            ratio_title = sub_title_sm.ratio()
-            ratio_authors = SequenceMatcher(
-                None, submission.author_list, sub.author_list
-            ).ratio()
-            ratio_abstract = SequenceMatcher(
-                None, submission.abstract, sub.abstract
-            ).ratio()
-            if ratio_title > ratio_threshold or ratio_abstract > ratio_threshold:
-                submission_matches.append(
+
+        # internal_plagiarism_matches is nullable; treat None as "no results yet"
+        stored_matches = submission.internal_plagiarism_matches or {}
+
+        context["submission_matches"] = []
+        if "submission_matches" in stored_matches:
+            for sub_match in stored_matches["submission_matches"]:
+                context["submission_matches"].append(
                     {
-                        "submission": sub,
-                        "ratio_title": ratio_title,
-                        "ratio_authors": ratio_authors,
-                        "ratio_abstract": ratio_abstract,
+                        "submission": Submission.objects.get(
+                            preprint__identifier_w_vn_nr=sub_match["identifier_w_vn_nr"],
+                        ),
+                        "ratio_title": sub_match["ratio_title"],
+                        "ratio_authors": sub_match["ratio_authors"],
+                        "ratio_abstract": sub_match["ratio_abstract"],
                     }
                 )
-        context["submission_matches"] = submission_matches
-
-        # Publications:
-        publication_matches = []
-        for pub in Publication.objects.all():
-            ratio_title = SequenceMatcher(None, submission.title, pub.title).ratio()
-            ratio_authors = SequenceMatcher(
-                None, submission.author_list, pub.author_list
-            ).ratio()
-            ratio_abstract = SequenceMatcher(
-                None, submission.abstract, pub.abstract
-            ).ratio()
-            if ratio_title > ratio_threshold or ratio_abstract > ratio_threshold:
-                publication_matches.append(
+
+        context["publication_matches"] = []
+        if "publication_matches" in stored_matches:
+            for pub_match in stored_matches["publication_matches"]:
+                context["publication_matches"].append(
                     {
-                        "publication": pub,
-                        "ratio_title": ratio_title,
-                        "ratio_authors": ratio_authors,
-                        "ratio_abstract": ratio_abstract,
+                        "publication": Publication.objects.get(doi_label=pub_match["doi_label"]),
+                        "ratio_title": pub_match["ratio_title"],
+                        "ratio_authors": pub_match["ratio_authors"],
+                        "ratio_abstract": pub_match["ratio_abstract"],
                     }
                 )
-        context["publication_matches"] = publication_matches
+
         return context