From 4e987d3e0253267de7d0dd0d55d2bcb1fb97d8cc Mon Sep 17 00:00:00 2001 From: Boris Ponsioen <b.g.t.ponsioen@uva.nl> Date: Tue, 17 Apr 2018 12:22:31 +0200 Subject: [PATCH] Adds background tasks for importing into citables, as well as admin panel for metacore journals --- SciPost_v1/settings/base.py | 1 + metacore/admin.py | 39 +++++-- metacore/forms.py | 3 + metacore/managers.py | 2 +- metacore/migrations/0001_initial.py | 29 +++++ .../migrations/0002_auto_20180417_1036.py | 25 ++++ metacore/models.py | 59 +++++++++- metacore/services.py | 107 ++++++++++++++++-- metacore/templates/citable_list.html | 2 +- .../partials/citable_card_content.html | 4 + requirements.txt | 2 +- 11 files changed, 248 insertions(+), 25 deletions(-) create mode 100644 metacore/migrations/0001_initial.py create mode 100644 metacore/migrations/0002_auto_20180417_1036.py diff --git a/SciPost_v1/settings/base.py b/SciPost_v1/settings/base.py index 476ac570d..db35cf629 100644 --- a/SciPost_v1/settings/base.py +++ b/SciPost_v1/settings/base.py @@ -107,6 +107,7 @@ INSTALLED_APPS = ( 'petitions', 'webpack_loader', 'metacore', + 'background_task', ) diff --git a/metacore/admin.py b/metacore/admin.py index 5a7ae8d4e..fd747960c 100644 --- a/metacore/admin.py +++ b/metacore/admin.py @@ -1,17 +1,34 @@ from django.contrib import admin -from .models import Citable, CitableWithDOI -from .services import get_crossref_test +from django.contrib import messages +from .models import Citable, CitableWithDOI, Journal +from .services import get_crossref_test, import_journal_full, get_crossref_work_count # Register your models here. -# def import_data_from_crossref(modeladmin, request, queryset): -# get_crossref_test() +class JournalAdmin(admin.ModelAdmin): + fields = ('name', 'ISSN_digital', 'last_full_sync') + list_display = ('name', 'ISSN_digital', 'last_full_sync', 'count_metacore', 'count_crossref') + actions = ['import_full', 'update_counts'] -# class CitableAdmin(admin.ModelAdmin): -# # list_display = ['title', 'status'] -# # ordering = ['title'] -# actions = [import_data_from_crossref] + def import_full(self, request, queryset): + """ Starts background task to import all works by this journal """ -# def get_queryset(self, request): -# return [] + for journal in queryset: + t = import_journal_full(journal.ISSN_digital) + messages.add_message(request, messages.INFO, 'Import task for journal {} added. Go to Background Tasks -> Tasks in admin to view them'.format(journal.name)) -# admin.site.register(Citable, CitableAdmin) + messages.add_message(request, messages.WARNING, 'Make sure to start the tasks by running ./manage.py process_tasks') + def update_counts(self, request, queryset): + for journal in queryset: + journal.count_metacore = Citable.objects(metadata__ISSN=journal.ISSN_digital).count() + journal.count_crossref = get_crossref_work_count(journal.ISSN_digital) + journal.save() + + messages.add_message(request, messages.INFO, 'Counts updated.') + + def get_actions(self, request): + actions = super().get_actions(request) + if 'delete_selected' in actions: + del actions['delete_selected'] + return actions + +admin.site.register(Journal, JournalAdmin) diff --git a/metacore/forms.py b/metacore/forms.py index 25b1831bd..2563a3c7c 100644 --- a/metacore/forms.py +++ b/metacore/forms.py @@ -8,6 +8,7 @@ class CitableSearchForm(forms.Form): author = forms.CharField(max_length=100, required=False, label="Author(s)") title = forms.CharField(max_length=100, required=False) publisher = forms.CharField(max_length=100, required=False) + journal = forms.CharField(max_length=100, required=False) def search_results(self): """Return all Citable objects according to search""" @@ -16,6 +17,7 @@ class CitableSearchForm(forms.Form): title__icontains=self.cleaned_data.get('title', ''), authors__icontains=self.cleaned_data.get('author', ''), publisher__icontains=self.cleaned_data.get('publisher', ''), + **{'metadata__container-title__icontains': self.cleaned_data.get('journal', '')}, ) else: """If a text index is present, search using the authors/title box is enables""" @@ -23,5 +25,6 @@ class CitableSearchForm(forms.Form): title__icontains=self.cleaned_data.get('title', ''), authors__icontains=self.cleaned_data.get('author', ''), publisher__icontains=self.cleaned_data.get('publisher', ''), + **{'metadata__container-title__icontains': self.cleaned_data.get('journal', '')}, ).omni_search(self.cleaned_data.get('omni'), 'and') diff --git a/metacore/managers.py b/metacore/managers.py index 057eba9f9..64d164bc7 100644 --- a/metacore/managers.py +++ b/metacore/managers.py @@ -10,7 +10,7 @@ class CitableQuerySet(QuerySet): return self.only('references').filter(references=dois) def simple(self): - return self.only('doi', 'title', 'authors', 'metadata.is-referenced-by-count', 'publication_date', 'publisher') + return self.only('doi', 'title', 'authors', 'metadata.is-referenced-by-count', 'publication_date', 'publisher', 'metadata.container-title') def prl(self): return self.filter(metadata__ISSN='0031-9007') diff --git a/metacore/migrations/0001_initial.py b/metacore/migrations/0001_initial.py new file mode 100644 index 000000000..ca1e4f9e6 --- /dev/null +++ b/metacore/migrations/0001_initial.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.4 on 2018-04-17 06:46 +from __future__ import unicode_literals + +import django.core.validators +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Journal', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=250)), + ('ISSN_digital', models.CharField(max_length=9, validators=[django.core.validators.RegexValidator('^[0-9]{4}-[0-9]{3}[0-9X]$')])), + ('ISSN_print', models.CharField(blank=True, max_length=9, null=True, validators=[django.core.validators.RegexValidator('^[0-9]{4}-[0-9]{3}[0-9X]$')])), + ('last_full_sync', models.DateTimeField(blank=True, null=True)), + ('last_cursor', models.CharField(blank=True, max_length=250, null=True)), + ('last_errors', models.TextField(blank=True, null=True)), + ], + ), + ] diff --git a/metacore/migrations/0002_auto_20180417_1036.py b/metacore/migrations/0002_auto_20180417_1036.py new file mode 100644 index 000000000..b6197fb09 --- /dev/null +++ b/metacore/migrations/0002_auto_20180417_1036.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.4 on 2018-04-17 08:36 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('metacore', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='journal', + name='count_crossref', + field=models.IntegerField(blank=True, null=True), + ), + migrations.AddField( + model_name='journal', + name='count_metacore', + field=models.IntegerField(blank=True, null=True), + ), + ] diff --git a/metacore/models.py b/metacore/models.py index 5e55faf0f..f251ce049 100644 --- a/metacore/models.py +++ b/metacore/models.py @@ -1,12 +1,13 @@ from django.db import models from django.conf import settings +from django.core.validators import RegexValidator +import requests from mongoengine import connect, DynamicDocument, ListField, StringField,\ DynamicField, URLField, DateTimeField from .managers import CitableQuerySet - # Make the connection to MongoDB - this could be put in settings.py as well # It uses default settings for the mongo server connect(settings.MONGO_DATABASE['database'], @@ -31,6 +32,7 @@ class Citable(DynamicDocument): publisher = StringField() license = URLField() publication_date = DateTimeField() + journal = StringField() # Dump all the raw source metadata here metadata = DynamicField() @@ -38,7 +40,7 @@ class Citable(DynamicDocument): # Settings for mongoengine meta = { 'queryset_class': CitableQuerySet, # use the custom queryset - 'indexes': ['doi', 'title', 'publication_date', 'publisher', 'references'], # define indices on database + 'indexes': ['doi', 'title', 'publication_date', 'publisher', 'references', 'journal'], # define indices on database 'allow_inheritance': True } """ @@ -46,7 +48,7 @@ class Citable(DynamicDocument): This should be in the readme, but I'll temporarily add it here for ease of use: For the text index, execute this in the mongo shell: use scipost - db.citable.createIndex({authors: "text", title: "text"}) + db.citable.createIndex({authors: "text", title: "text", journal: "text"}) """ def times_cited(self): @@ -68,3 +70,54 @@ class CitableWithDOI(Citable): def times_cited(self): return CitableWithDOI.objects.cited_by(self.doi).count() + +class Journal(models.Model): + """ Provides interface for importing citables of a journal into Metacore """ + + name = models.CharField( + max_length=250, + blank=False) + ISSN_digital = models.CharField( + max_length=9, + validators=[RegexValidator(r'^[0-9]{4}-[0-9]{3}[0-9X]$')], + blank=False) + ISSN_print = models.CharField( + max_length=9, + validators=[RegexValidator(r'^[0-9]{4}-[0-9]{3}[0-9X]$')], + blank=True, null=True) + last_full_sync = models.DateTimeField(blank=True, null=True) + last_cursor = models.CharField( + max_length=250, + blank=True, null=True) + last_errors = models.TextField( + blank=True, null=True) + count_metacore = models.IntegerField(blank=True, null=True) + count_crossref = models.IntegerField(blank=True, null=True) + + + def update_count_metacore(self): + count = Citable.objects(metadata__ISSN=self.ISSN_digital).count() + self.count_metacore = count + + def update_count_crossref(self): + """ + Returns the total number of citables that are present in CR for a given ISSN + + Needs to be merged with .services but need to work out imports first (circular) + """ + + # Formulate the CR query + url = 'https://api.crossref.org/journals/{}/works'.format(self.ISSN_digital) + + # If the loop is allowed to complete, it fetches (rows * batches) records + rows = 0 + + params = {'rows': rows, 'mailto': 'b.g.t.ponsioen@uva.nl'} + r = requests.get(url, params=params) + r_json = r.json() + + result = r_json['message'] + + if 'total-results' in result: + self.count_metacore = result['total-results'] + diff --git a/metacore/services.py b/metacore/services.py index 8ad65b969..9e4996d1b 100644 --- a/metacore/services.py +++ b/metacore/services.py @@ -1,6 +1,83 @@ import requests from .models import Citable, CitableWithDOI +from background_task import background +import logging +logger = logging.getLogger(__name__) + +@background() +def import_journal_full(issn, cursor='*'): + """ + Task to query CrossRef for all works of a journal with given ISSN + and store them in the Metacore mongo database + """ + + # Formulate the CR query + url = 'https://api.crossref.org/journals/{}/works'.format(issn) + + # If the loop is allowed to complete, it fetches (rows * batches) records + rows = 500 + batches = 2000 + last_cursor = cursor + + for i in range(0,batches): + # print("-------------------------------") + # print("Batch %s" % (i, )) + # print("Last cursor: ", last_cursor) + # print("Current cursor: ", cursor) + logger.info("-------------------------------") + logger.info("Batch %s" % (i, )) + logger.info("Last cursor: ", last_cursor) + logger.info("Current cursor: ", cursor) + + params = {'cursor': cursor, 'rows': rows, 'mailto': 'b.g.t.ponsioen@uva.nl'} + last_cursor = cursor + r = requests.get(url, params=params) + r_json = r.json() + + citables_json = r_json['message']['items'] + cursor = r_json['message']['next-cursor'] + number_of_results = len(r_json['message']['items']) + + citables = [parse_crossref_citable(it) for it in citables_json] + # Parser returns False if there's an error + errors = any([not i for i in citables if i == False]) + orig_citables = citables + citables = [citable for citable in citables if citable] + + # Mass insert in database (will fail on encountering existing documents + # with same DOI + if citables: + Citable.objects.insert(citables) + + citable = [] + + if number_of_results < rows: + # print(number_of_results) + # print('End reached.') + logger.info(number_of_results) + logger.info('End reached.') + break + +def get_crossref_work_count(issn): + """ + Returns the total number of citables that are present in CR for a given ISSN + """ + + # Formulate the CR query + url = 'https://api.crossref.org/journals/{}/works'.format(issn) + + # If the loop is allowed to complete, it fetches (rows * batches) records + rows = 0 + + params = {'rows': rows, 'mailto': 'b.g.t.ponsioen@uva.nl'} + r = requests.get(url, params=params) + r_json = r.json() + + result = r_json['message'] + + if 'total-results' in result: + return result['total-results'] def get_crossref_test(cursor='*'): """ @@ -23,10 +100,14 @@ def get_crossref_test(cursor='*'): last_cursor = cursor for i in range(0,batches): - print("-------------------------------") - print("Batch %s" % (i, )) - print("Last cursor: ", last_cursor) - print("Current cursor: ", cursor) + # print("-------------------------------") + # print("Batch %s" % (i, )) + # print("Last cursor: ", last_cursor) + # print("Current cursor: ", cursor) + logger.info("-------------------------------") + logger.info("Batch %s" % (i, )) + logger.info("Last cursor: ", last_cursor) + logger.info("Current cursor: ", cursor) params = {'cursor': cursor, 'rows': rows, 'mailto': 'b.g.t.ponsioen@uva.nl'} last_cursor = cursor @@ -51,8 +132,10 @@ def get_crossref_test(cursor='*'): citable = [] if number_of_results < rows: - print(number_of_results) - print('End reached.') + # print(number_of_results) + # print('End reached.') + logger.info(number_of_results) + logger.info('End reached.') break def convert_doi_to_lower_case(): @@ -107,6 +190,9 @@ def parse_crossref_citable(citable_item): else: license = '' + if 'container-title' in citable_item: + journal = citable_item['container-title'][0] + return CitableWithDOI(doi=doi, references=references, authors=authors, publisher=publisher, title=title, publication_date=publication_date, license=license, metadata=citable_item) @@ -115,6 +201,11 @@ def parse_crossref_citable(citable_item): # print(e) # # raise except Exception as e: - print("Error: ", e) - print(citable_item['DOI']) + # print("Error: ", e) + # print(citable_item['DOI']) + # print(citable_item.keys()) + logger.error("Error: ", e) + logger.error(citable_item['DOI']) + logger.error(citable_item.keys()) return False + diff --git a/metacore/templates/citable_list.html b/metacore/templates/citable_list.html index 1740d90d2..265eb0bfd 100644 --- a/metacore/templates/citable_list.html +++ b/metacore/templates/citable_list.html @@ -44,7 +44,7 @@ <div class="fieldWrapper"> {{ form.omni.errors }} - <label for="{{ form.subject.id_for_label }}">Author and/or title (matches full words)</label> + <label for="{{ form.subject.id_for_label }}">Author, title, journal (matches full words)</label> {{ form.omni }} </div> <br/> diff --git a/metacore/templates/partials/citable_card_content.html b/metacore/templates/partials/citable_card_content.html index f4a8a3688..e26872a7e 100644 --- a/metacore/templates/partials/citable_card_content.html +++ b/metacore/templates/partials/citable_card_content.html @@ -12,5 +12,9 @@ | DOI <a href='https://doi.org/{{ citable.doi }}'> {{ citable.doi }} </a> <br> Published {{ citable.publication_date|date:"d-m-Y" }} by {{ citable.publisher }} + {% if citable.journal %} + <br> + in {{ citable.journal }} + {% endif %} </p> {% endblock %} diff --git a/requirements.txt b/requirements.txt index 6d5976ccc..0d5ea445a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -52,7 +52,7 @@ html2text # Mongo (Metacore) mongoengine==0.15.0 - +django-background-tasks==1.1.13 # Possibly dead (most probably not used anymore and possibly not up-to-date packages) -- JdW (August 15th, 2017) imagesize==0.7.1 -- GitLab