diff --git a/metacore/models.py b/metacore/models.py index 567a94550050874a644833c4948e3e795903e5e5..56f5764f1a7c479ece833550d24d828c335f5934 100644 --- a/metacore/models.py +++ b/metacore/models.py @@ -6,6 +6,7 @@ from .managers import CitableQuerySet # Make the connection to MongoDB - this could be put in settings.py as well +# It uses default settings for the mongo server connect('scipost') class Citable(DynamicDocument): @@ -14,14 +15,18 @@ class Citable(DynamicDocument): (with DOI) or preprint of an published/unpublished document. """ + # Fields that are extracted from the source metadata in order to normalize + # some of the data for searching / metrics references = ListField(StringField()) - metadata = DynamicField() authors = ListField(StringField()) title = StringField() publisher = StringField() license = URLField() publication_date = DateTimeField() + # Dump all the raw source metadata here + metadata = DynamicField() + # Settings for mongoengine meta = { 'queryset_class': CitableQuerySet, # use the custom queryset @@ -35,6 +40,9 @@ class Citable(DynamicDocument): def author_list(self): return '; '.join(self.authors) + def crossref_ref_count(self): + return self.metadata['reference-count'] + class CitableWithDOI(Citable): """ diff --git a/metacore/services.py b/metacore/services.py index e55fcefde45083756cc301f9ccb082a846a42af6..0e0d37663c70f429f22744f472a32480250f4a08 100644 --- a/metacore/services.py +++ b/metacore/services.py @@ -6,18 +6,37 @@ def get_crossref_test(): For testing purposes - retrieves a small dataset from CrossRef and saves it in de database, after parsing """ - url = 'https://api.crossref.org/works' - params = {'query.publisher-name': 'scipost', 'rows': 1000} - r = requests.get(url, params=params) + # url = 'https://api.crossref.org/works' + url = 'https://api.crossref.org/members/16/works' + cursor = '*' + cursor = 'AoJ79tDrpd8CPwtodHRwOi8vZHguZG9pLm9yZy8xMC4xMTAzL3BoeXNyZXZiLjQyLjgxMjU=' + rows = 1000 - citables_json = r.json()['message']['items'] + for i in range(1,100): + print("Batch %s" % (i, )) + print("-------------------------------") + print(cursor) + # params = {'query.publisher-name': 'American Physical Society', 'cursor': cursor, 'rows': rows} + params = {'cursor': cursor, 'rows': rows, 'mailto': 'b.g.t.ponsioen@uva.nl'} + r = requests.get(url, params=params) + r_json = r.json() - citables = [parse_crossref_citable(it) for it in citables_json] - citables = [citable for citable in citables if citable is not None] + citables_json = r_json['message']['items'] + cursor = r_json['message']['next-cursor'] + number_of_results = len(r_json['message']['items']) + print(number_of_results) - # Mass insert in database (will fail on encountering existing documents - # with same DOI - return Citable.objects.insert(citables) + citables = [parse_crossref_citable(it) for it in citables_json] + citables = [citable for citable in citables if citable is not None] + + # Mass insert in database (will fail on encountering existing documents + # with same DOI + if citables: + Citable.objects.insert(citables) + + if number_of_results < rows: + print('End reached.') + break def parse_crossref_citable(citable_item): if not citable_item['type'] == 'journal-article': @@ -32,17 +51,36 @@ def parse_crossref_citable(citable_item): try: # Parse certain fields for storage on top level in document # Blame the convoluted joining and looping on CR - references_with_doi = [ref for ref in citable_item['reference'] if 'DOI' in ref] - references = [ref['DOI'] for ref in references_with_doi] - authors = [' '.join([author_names['given'], author_names['family']]) for author_names in citable_item['author']] + + if 'reference' in citable_item: + references_with_doi = [ref for ref in citable_item['reference'] if 'DOI' in ref] + references = [ref['DOI'] for ref in references_with_doi] + else: + references = [] + + authors = [] + for author_names in citable_item['author']: + author = [] + if 'given' in author_names: + author.append(author_names['given']) + if 'family' in author_names: + author.append(author_names['family']) + + authors.append(' '.join(author)) + publisher = citable_item['publisher'] title = citable_item['title'][0] - publication_date = '-'.join([str(date_part) for date_part in citable_item['published-online']['date-parts'][0]]) - license = citable_item['license'][0]['URL'] + publication_date = '-'.join([str(date_part) for date_part in citable_item['issued']['date-parts'][0]]) + if 'license' in citable_item: + license = citable_item['license'][0]['URL'] + else: + license = '' return CitableWithDOI(doi=doi, references=references, authors=authors, publisher=publisher, title=title, publication_date=publication_date, license=license, metadata=citable_item) + except BaseException as e: + print(e) + # raise except: print(citable_item) - raise diff --git a/metacore/templates/partials/citable_card_content.html b/metacore/templates/partials/citable_card_content.html index cfbe61343867d1180fd0bfa41d17e7a03db1c8c0..5fe65565e36ca3ef634225fae968ef2a7b0483de 100644 --- a/metacore/templates/partials/citable_card_content.html +++ b/metacore/templates/partials/citable_card_content.html @@ -8,7 +8,7 @@ {% block card_footer %} <p class="text-muted mb-0"> - Cited {{ citable.times_cited }} times + Cited {{ citable.times_cited }} times (Crossref: {{ citable.crossref_ref_count }}) | DOI {{ citable.doi }} | <br> Published {{ citable.publication_date|date:"d-m-Y" }} by {{ citable.publisher }} </p>