From f3731584a1ff7ea9cf99aa7e64a01d70737dcb3c Mon Sep 17 00:00:00 2001
From: Boris Ponsioen <b.g.t.ponsioen@uva.nl>
Date: Wed, 31 Jan 2018 19:08:19 +0100
Subject: [PATCH] Cleaned up a bit

---
 metacore/models.py                            | 10 ++-
 metacore/services.py                          | 68 +++++++++++++++----
 .../partials/citable_card_content.html        |  2 +-
 3 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/metacore/models.py b/metacore/models.py
index 567a94550..56f5764f1 100644
--- a/metacore/models.py
+++ b/metacore/models.py
@@ -6,6 +6,7 @@ from .managers import CitableQuerySet
 
 
 # Make the connection to MongoDB - this could be put in settings.py as well
+# Only the database name is given, so mongoengine uses its default settings for the mongo server
 connect('scipost')
 
 class Citable(DynamicDocument):
@@ -14,14 +15,18 @@ class Citable(DynamicDocument):
     (with DOI) or preprint of an published/unpublished document.
     """
 
+    # Fields that are extracted from the source metadata in order to normalize
+    # some of the data for searching / metrics
     references = ListField(StringField())
-    metadata = DynamicField()
     authors = ListField(StringField())
     title = StringField()
     publisher = StringField()
     license = URLField()
     publication_date = DateTimeField()
 
+    # Dump all the raw source metadata here
+    metadata = DynamicField()
+
     # Settings for mongoengine
     meta = {
             'queryset_class': CitableQuerySet, # use the custom queryset
@@ -35,6 +40,9 @@ class Citable(DynamicDocument):
     def author_list(self):
         return '; '.join(self.authors)
 
+    def crossref_ref_count(self):
+        return self.metadata['reference-count']
+
 
 class CitableWithDOI(Citable):
     """
diff --git a/metacore/services.py b/metacore/services.py
index e55fcefde..0e0d37663 100644
--- a/metacore/services.py
+++ b/metacore/services.py
@@ -6,18 +6,37 @@ def get_crossref_test():
     For testing purposes - retrieves a small dataset from CrossRef and saves it
     in de database, after parsing
     """
-    url = 'https://api.crossref.org/works'
-    params = {'query.publisher-name': 'scipost', 'rows': 1000}
-    r = requests.get(url, params=params)
+    # url = 'https://api.crossref.org/works'
+    url = 'https://api.crossref.org/members/16/works'
+    cursor = '*'
+    cursor = 'AoJ79tDrpd8CPwtodHRwOi8vZHguZG9pLm9yZy8xMC4xMTAzL3BoeXNyZXZiLjQyLjgxMjU='
+    rows = 1000
 
-    citables_json = r.json()['message']['items']
+    for i in range(1,100):
+        print("Batch %s" % (i, ))
+        print("-------------------------------")
+        print(cursor)
+        # params = {'query.publisher-name': 'American Physical Society', 'cursor': cursor, 'rows': rows}
+        params = {'cursor': cursor, 'rows': rows, 'mailto': 'b.g.t.ponsioen@uva.nl'}
+        r = requests.get(url, params=params)
+        r_json = r.json()
 
-    citables = [parse_crossref_citable(it) for it in citables_json]
-    citables = [citable for citable in citables if citable is not None]
+        citables_json = r_json['message']['items']
+        cursor = r_json['message']['next-cursor']
+        number_of_results = len(r_json['message']['items'])
+        print(number_of_results)
 
-    # Mass insert in database (will fail on encountering existing documents
-    # with same DOI
-    return Citable.objects.insert(citables)
+        citables = [parse_crossref_citable(it) for it in citables_json]
+        citables = [citable for citable in citables if citable is not None]
+
+        # Mass insert in database (will fail on encountering existing documents
+        # with same DOI)
+        if citables:
+            Citable.objects.insert(citables)
+
+        if number_of_results < rows:
+            print('End reached.')
+            break
 
 def parse_crossref_citable(citable_item):
     if not citable_item['type'] == 'journal-article':
@@ -32,17 +51,36 @@ def parse_crossref_citable(citable_item):
         try:
             # Parse certain fields for storage on top level in document
             # Blame the convoluted joining and looping on CR
-            references_with_doi = [ref for ref in citable_item['reference'] if 'DOI' in ref]
-            references = [ref['DOI'] for ref in references_with_doi]
-            authors = [' '.join([author_names['given'], author_names['family']]) for author_names in citable_item['author']]
+
+            if 'reference' in citable_item:
+                references_with_doi = [ref for ref in citable_item['reference'] if 'DOI' in ref]
+                references = [ref['DOI'] for ref in references_with_doi]
+            else:
+                references = []
+
+            authors = []
+            for author_names in citable_item['author']:
+                author = []
+                if 'given' in author_names:
+                    author.append(author_names['given'])
+                if 'family' in author_names:
+                    author.append(author_names['family'])
+
+                authors.append(' '.join(author))
+
             publisher = citable_item['publisher']
             title = citable_item['title'][0]
-            publication_date = '-'.join([str(date_part) for date_part in citable_item['published-online']['date-parts'][0]])
-            license = citable_item['license'][0]['URL']
+            publication_date = '-'.join([str(date_part) for date_part in citable_item['issued']['date-parts'][0]])
+            if 'license' in citable_item:
+                license = citable_item['license'][0]['URL']
+            else:
+                license = ''
 
             return CitableWithDOI(doi=doi, references=references, authors=authors, publisher=publisher, title=title, 
                     publication_date=publication_date, license=license, metadata=citable_item)
 
+        except BaseException as e:
+            print(e)
+            # raise
         except:
             print(citable_item)
-            raise
diff --git a/metacore/templates/partials/citable_card_content.html b/metacore/templates/partials/citable_card_content.html
index cfbe61343..5fe65565e 100644
--- a/metacore/templates/partials/citable_card_content.html
+++ b/metacore/templates/partials/citable_card_content.html
@@ -8,7 +8,7 @@
 
 {% block card_footer %}
     <p class="text-muted mb-0">
-      Cited {{ citable.times_cited }} times
+      Cited {{ citable.times_cited }} times (Crossref: {{ citable.crossref_ref_count }}) | DOI {{ citable.doi }} | 
       <br>
       Published {{ citable.publication_date|date:"d-m-Y" }} by {{ citable.publisher }}
     </p>
-- 
GitLab