# publication.py

__copyright__ = "Copyright © Stichting SciPost (SciPost Foundation)"
__license__ = "AGPL v3"


from decimal import Decimal
from string import Template as string_Template
import re
from typing import TYPE_CHECKING

from django.contrib.postgres.fields import ArrayField
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models import Min, Sum, OuterRef, Exists, Q
from django.urls import reverse
from django.utils.functional import cached_property

from ..constants import (
    STATUS_DRAFT,
    STATUS_PUBLISHED,
    PUBLICATION_PUBLISHED,
    CCBY4,
    CC_LICENSES,
    CC_LICENSES_URI,
    PUBLICATION_STATUSES,
)
from ..helpers import paper_nr_string
from ..managers import PublicationQuerySet
from ..validators import doi_publication_validator

from common.utils import get_current_domain
from finances.models import PubFrac
from scipost.constants import SCIPOST_APPROACHES
from scipost.fields import ChoiceArrayField


if TYPE_CHECKING:
    from django.db.models.manager import RelatedManager
    from production.models import ProofsRepository
    from profiles.models import Profile


class PublicationAuthorsTable(models.Model):
    """
    PublicationAuthorsTable represents an author of a Publication.

    Fields:

    * publication
    * profile
    * affiliations: for this author/Publication (supersede profile.affiliations)
    * order: the ordinal position of this author in this Publication's list of authors.
    """

    publication = models.ForeignKey(
        "journals.Publication", on_delete=models.CASCADE, related_name="authors"
    )
    profile = models.ForeignKey["Profile"](
        "profiles.Profile", on_delete=models.PROTECT, blank=True, null=True
    )
    affiliations = models.ManyToManyField("organizations.Organization", blank=True)
    order = models.PositiveSmallIntegerField()

    class Meta:
        ordering = ("order",)

    def __str__(self) -> str:
        """Return the string form of the linked profile ("None" for placeholders)."""
        return str(self.profile)

    def save(self, *args, **kwargs):
        """Auto increment order number if not explicitly set.

        A falsy ``order`` (unset or 0) is replaced by the publication's current
        author count plus one before delegating to the regular model save.
        """
        if not self.order:
            self.order = self.publication.authors.count() + 1
        return super().save(*args, **kwargs)

    def is_empty(self) -> bool:
        """Check if object is a temporary placeholder (no profile attached)."""
        return self.profile is None

    @property
    def first_name(self) -> str | None:
        """Return first name of author. If no profile is attached, return None."""
        if self.profile:
            return self.profile.first_name
        return None

    @property
    def last_name(self) -> str | None:
        """Return last name of author. If no profile is attached, return None."""
        if self.profile:
            return self.profile.last_name
        return None


class Publication(models.Model):
    """A Publication is an object directly related to an accepted Submission.

    It contains metadata, the actual publication file, author data, etc. etc.
    It may be directly related to a Journal or to an Issue.
    """

    PUBTYPE_ARTICLE = "article"
    PUBTYPE_CODEBASE_RELEASE = "codebase_release"
    PUBTYPE_CHOICES = (
        (PUBTYPE_ARTICLE, "Article"),
        (PUBTYPE_CODEBASE_RELEASE, "Codebase release"),
    )
    pubtype = models.CharField(
        max_length=32,
        choices=PUBTYPE_CHOICES,
        default=PUBTYPE_ARTICLE,
    )

    # Publication data
    accepted_submission = models.ForeignKey(
        "submissions.Submission",
        on_delete=models.CASCADE,
    )
    in_issue = models.ForeignKey(
        "journals.Issue",
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        help_text="Assign either an Issue or Journal to the Publication",
    )
    in_journal = models.ForeignKey(
        "journals.Journal",
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        help_text="Assign either an Issue or Journal to the Publication",
    )
    paper_nr = models.PositiveSmallIntegerField()
    paper_nr_suffix = models.CharField(max_length=32, blank=True)
    status = models.CharField(
        max_length=8, choices=PUBLICATION_STATUSES, default=STATUS_DRAFT
    )

    # Core fields
    title = models.CharField(max_length=300)
    author_list = models.CharField(max_length=10000, verbose_name="author list")
    abstract = models.TextField()
    abstract_jats = models.TextField(
        blank=True,
        default="",
        help_text="JATS version of abstract for Crossref deposit",
    )
    pdf_file = models.FileField(
        upload_to="UPLOADS/PUBLICATIONS/%Y/%m/", max_length=200, blank=True
    )

    # Ontology-based semantic linking
    acad_field = models.ForeignKey(
        "ontology.AcademicField", on_delete=models.PROTECT, related_name="publications"
    )
    specialties = models.ManyToManyField(
        "ontology.Specialty", related_name="publications"
    )
    topics = models.ManyToManyField("ontology.Topic", blank=True)
    approaches = ChoiceArrayField(
        models.CharField(max_length=24, choices=SCIPOST_APPROACHES),
        blank=True,
        null=True,
    )

    cc_license = models.CharField(max_length=32, choices=CC_LICENSES, default=CCBY4)

    # Funders
    grants = models.ManyToManyField("funders.Grant", blank=True)
    funders_generic = models.ManyToManyField(
        "funders.Funder", blank=True
    )  # not linked to a grant

    # Metadata
    metadata = models.JSONField(default=dict, blank=True, null=True)
    metadata_xml = models.TextField(blank=True)  # for Crossref deposit
    metadata_DOAJ = models.JSONField(default=dict, blank=True, null=True)
    doi_label = models.CharField(
        max_length=200,
        unique=True,
        db_index=True,
        validators=[doi_publication_validator],
    )
    doideposit_needs_updating = models.BooleanField(default=False)
    citedby = models.JSONField(default=dict, blank=True, null=True)
    number_of_citations = models.PositiveIntegerField(default=0)

    # To handle cases without parsable author info (e.g. docx)
    author_info_source = models.TextField(blank=True, null=True)

    # Date fields
    submission_date = models.DateField(verbose_name="submission date")
    acceptance_date = models.DateField(verbose_name="acceptance date")
    publication_date = models.DateField(verbose_name="publication date")
    latest_citedby_update = models.DateTimeField(null=True, blank=True)
    latest_metadata_update = models.DateTimeField(blank=True, null=True)
    latest_activity = models.DateTimeField(
        auto_now=True
    )  # Needs `auto_now` since it is not explicitly updated anywhere?

    # Calculated fields
    cf_citation = models.CharField(
        max_length=1024,
        blank=True,
        help_text="NB: calculated field. Do not modify.",
    )
    cf_author_affiliation_indices_list = ArrayField(
        ArrayField(
            models.PositiveSmallIntegerField(blank=True, null=True), default=list
        ),
        default=list,
        help_text="NB: calculated field. Do not modify.",
    )

    objects = PublicationQuerySet.as_manager()

    if TYPE_CHECKING:
        authors = RelatedManager[PublicationAuthorsTable]

    class Meta:
        # Default name used for reverse relations declared on this model.
        default_related_name = "publications"
        # Default ordering: latest publication date first, then highest paper number.
        ordering = ("-publication_date", "-paper_nr")

    def __str__(self):
        """Return citation, title and author list (each cut to 30 chars) and the date."""
        cite = self.citation
        short_title = self.title[:30]
        short_authors = self.author_list[:30]
        published_on = self.publication_date.strftime("%Y-%m-%d")
        return f"{cite}, {short_title} by {short_authors}, {published_on}"

    def clean(self):
        """Check that exactly one of Journal or Issue is assigned, and that it fits.

        Raises:
            ValidationError: keyed by the offending field name(s) when
                * neither ``in_journal`` nor ``in_issue`` is set;
                * both are set;
                * an Issue is set but the resolved journal's ``has_issues`` is falsy;
                * a Journal is set but its ``has_issues`` is truthy.
        """

        def field_errors(field_names, message, code):
            # One separate ValidationError instance per field, keyed by field name.
            return ValidationError(
                {name: ValidationError(message, code=code) for name in field_names}
            )

        journal_and_issue = ("in_journal", "in_issue")

        # Neither container is assigned.
        if not self.in_journal and not self.in_issue:
            raise field_errors(
                journal_and_issue,
                "Either assign a Journal or Issue to this Publication",
                "required",
            )

        # Assigning both a Journal and an Issue will screw up the database.
        if self.in_journal and self.in_issue:
            raise field_errors(
                journal_and_issue,
                "Either assign only a Journal or Issue to this Publication",
                "invalid",
            )

        # An Issue was chosen, but the journal it resolves to does not use Issues.
        if self.in_issue and not self.get_journal().has_issues:
            raise field_errors(
                ("in_issue",),
                "This journal does not allow the use of Issues",
                "invalid",
            )

        # A Journal was chosen directly, but that journal organizes papers in Issues.
        if self.in_journal and self.get_journal().has_issues:
            raise field_errors(
                ("in_journal",),
                "This journal does not allow the use of individual Publications",
                "invalid",
            )

    def get_absolute_url(self):
        """Return the URL of the "scipost:publication_detail" route for this DOI label."""
        url_args = (self.doi_label,)
        return reverse("scipost:publication_detail", args=url_args)

    def get_cc_license_URI(self):
        """Return the URI paired with this publication's license in CC_LICENSES_URI.

        Raises:
            KeyError: if ``cc_license`` has no entry in ``CC_LICENSES_URI``; the
                exception carries the unknown license value.
        """
        for key, uri in CC_LICENSES_URI:
            if key == self.cc_license:
                return uri
        # Include the offending value so the failure can be diagnosed from the traceback.
        raise KeyError(self.cc_license)

    def get_all_affiliations(self):
        """
        Return the Organizations linked to this publication through its author rows.

        Each Organization is annotated with ``order``, the smallest author order
        among the rows referring to it, and the result is sorted by that value.
        """
        from organizations.models import Organization

        linked_orgs = Organization.objects.filter(
            publicationauthorstable__publication=self
        )
        with_first_position = linked_orgs.annotate(
            order=Min("publicationauthorstable__order")
        )
        return with_first_position.order_by("order")

    def get_author_affiliation_indices_list(self):
        """
        Return a list containing for each author an ordered list of affiliation indices.

        This is for display on the publication detail page,
        and is a calculated field (saved in the model) to avoid
        unnecessary db queries (problematic for papers with large number of authors).

        Indices are 1-based positions in ``get_all_affiliations()``. All inner lists
        have the same length: shorter ones are padded with ``None``, and an author
        without any affiliation is represented by ``[None]`` (plus padding).

        The computed value is stored both in the database row and on this
        instance, so that a later ``save()`` of this instance does not overwrite
        the stored value with a stale empty list.
        """
        if self.cf_author_affiliation_indices_list:
            return self.cf_author_affiliation_indices_list

        # Materialize once; the same ordered list is scanned for every author.
        pub_affiliations = list(self.get_all_affiliations())
        pub_authors = self.authors.all().prefetch_related("affiliations")

        indexed_author_list = []
        for author in pub_authors:
            # Set of primary keys gives O(1) membership tests per affiliation.
            author_aff_pks = {aff.pk for aff in author.affiliations.all()}
            affnrs = [
                idx
                for idx, aff in enumerate(pub_affiliations, start=1)
                if aff.pk in author_aff_pks
            ]
            # If no affiliation is found, add a None to the list
            # Prevents crashes since Django v4
            indexed_author_list.append(affnrs or [None])

        # Since nested ArrayFields must have the same dimension,
        # we pad the "empty" entries with Null:
        max_length = max(map(len, indexed_author_list), default=0)
        padded_list = [
            entry + [None] * (max_length - len(entry))
            for entry in indexed_author_list
        ]

        # Save into the calculated field for future purposes.
        Publication.objects.filter(id=self.id).update(
            cf_author_affiliation_indices_list=padded_list
        )
        # Keep the in-memory instance in sync with the row just updated.
        self.cf_author_affiliation_indices_list = padded_list
        return padded_list

    def get_all_funders(self):
        """Return the Funders tied to this publication via a grant or listed directly.

        A Funder qualifies when one of this publication's grants points to it,
        or when it is among this publication's ``funders_generic``.
        """
        from funders.models import Funder

        funds_a_grant = Exists(self.grants.filter(funder=OuterRef("pk")))
        listed_directly = Exists(self.funders_generic.filter(pk=OuterRef("pk")))

        annotated = Funder.objects.annotate(
            has_publication_in_grants=funds_a_grant,
            has_publication_in_funders_generic=listed_directly,
        )
        matches_either = Q(has_publication_in_grants=True) | Q(
            has_publication_in_funders_generic=True
        )
        return annotated.filter(matches_either)

    def get_affiliations(self):
        """Return the distinct Organizations referenced by this publication's author rows."""
        from organizations.models import Organization

        linked_orgs = Organization.objects.filter(
            publicationauthorstable__publication=self
        )
        return linked_orgs.distinct()

    @property
    def doi_string(self) -> str:
        """Return the full DOI: the fixed "10.21468/" prefix followed by ``doi_label``."""
        prefix = "10.21468/"
        return prefix + self.doi_label

    @property
    def is_draft(self) -> bool:
        """Tell whether ``status`` equals ``STATUS_DRAFT``, i.e. still awaiting publication."""
        current_status = self.status
        return current_status == STATUS_DRAFT

    @property
    def is_published(self) -> bool:
        """Tell whether this publication and its container are both published.

        Requires ``status`` to equal ``PUBLICATION_PUBLISHED``; then the Issue must
        have status ``STATUS_PUBLISHED`` or, when there is no Issue, the Journal's
        ``active`` value is returned. Without either container the answer is False.
        """
        if self.status == PUBLICATION_PUBLISHED:
            if self.in_issue:
                return self.in_issue.status == STATUS_PUBLISHED
            if self.in_journal:
                return self.in_journal.active
        return False

    @property
    def has_abstract_jats(self) -> bool:
        """Tell whether ``abstract_jats`` (JATS abstract for Crossref deposits) is non-empty."""
        jats_abstract = self.abstract_jats
        return jats_abstract != ""

    @property
    def has_xml_metadata(self) -> bool:
        """Tell whether ``metadata_xml`` (XML metadata for Crossref deposits) is non-empty."""
        xml_metadata = self.metadata_xml
        return xml_metadata != ""

    @property
    def BiBTeX(self):
        """Return a BibTeX ``@Article`` entry for this publication as a string.

        The entry key is the DOI string. Commas in ``author_list`` are turned into
        " and". A ``journal`` line (plus ``volume`` when the Issue sits in a Volume)
        is included when a journal can be reached through the Issue or the Journal.
        """
        authors = self.author_list.replace(",", " and")
        chunks = [
            "@Article{%s,\n\ttitle={{%s}},\n\tauthor={%s},\n"
            % (self.doi_string, self.title, authors)
        ]

        issue = self.in_issue
        if issue:
            volume = issue.in_volume
            if volume:
                chunks.append(
                    "\tjournal={%s},\n\tvolume={%i},\n"
                    % (volume.in_journal.name_abbrev, volume.number)
                )
            elif issue.in_journal:
                chunks.append("\tjournal={%s},\n" % issue.in_journal.name_abbrev)
        elif self.in_journal:
            chunks.append("\tjournal={%s},\n" % self.in_journal.name_abbrev)

        closing_template = (
            "\tpages={%s},\n"
            "\tyear={%s},\n"
            "\tpublisher={SciPost},\n"
            "\tdoi={%s},\n"
            "\turl={https://%s/%s},\n"
            "}"
        )
        chunks.append(
            closing_template
            % (
                self.get_paper_nr(),
                self.publication_date.strftime("%Y"),
                self.doi_string,
                get_current_domain(),
                self.doi_string,
            )
        )
        return "".join(chunks)

    def resources_as_md(self) -> str | None:
        """Return a Markdown string representing the list of resources.

        Returns:
            A "## Resources:" heading followed by one bullet per resource (its
            type display label and a link to its URL), or None when the
            publication has no resources.
        """
        # Evaluate the related queryset once and reuse the result.
        resources = list(self.resources.all())
        if not resources:
            return None

        bullets = [
            f"* {resource.get__type_display()} "
            f"at [{resource.url}]({resource.url})\n"
            for resource in resources
        ]
        return "## Resources:\n\n" + "".join(bullets)

    @property
    def has_citation_list(self):
        """Tell whether ``metadata`` holds a non-empty "citation_list" entry.

        ``metadata`` is nullable; a missing (None) value counts as having no list.
        """
        metadata = self.metadata or {}
        return "citation_list" in metadata and len(metadata["citation_list"]) > 0

    @property
    def has_funding_statement(self):
        """Return the "funding_statement" entry of ``metadata`` if present, else False.

        The stored value itself is returned (truthy when a statement exists), as
        before. ``metadata`` is nullable; a missing (None) value yields False.
        """
        metadata = self.metadata or {}
        return "funding_statement" in metadata and metadata["funding_statement"]

    @property
    def expenditures(self):
        """Return the journal's cost per publication for this publication's year."""
        journal = self.get_journal()
        return journal.cost_per_publication(self.publication_date.year)

    def recalculate_pubfracs(self):
        """Recalculates PubFracs using the balanced affiliations algorithm.

        Every author carries an equal share (1 / number of authors), split evenly
        over that author's affiliations. The shares are summed per organization,
        truncated to three decimals and stored in the matching PubFrac; the
        remainder is then assigned through ``ensure_pubfracs_sum_to_1``.
        PubFracs for organizations that are no longer affiliated are deleted.
        """
        # Query the affiliations once and reuse the result throughout.
        affiliations = list(self.get_affiliations())
        aff_ids = [aff.id for aff in affiliations]

        # First, remove non-affiliation-related PubFracs
        PubFrac.objects.filter(publication=self).exclude(
            organization__pk__in=aff_ids
        ).delete()

        # Now recreate according to the balanced affiliations algorithm
        authors = list(self.authors.all().prefetch_related("affiliations"))
        nr_authors = len(authors)
        fraction = dict.fromkeys(aff_ids, 0)
        for author in authors:
            author_affiliations = list(author.affiliations.all())
            nr_affiliations = len(author_affiliations)
            for aff in author_affiliations:
                fraction[aff.id] += 1.0 / (nr_authors * nr_affiliations)

        for org in affiliations:
            pubfrac, _ = PubFrac.objects.get_or_create(
                publication=self,
                organization=org,
            )
            # ensure 3 digit accuracy for all fractions through integer cast
            pubfrac.fraction = 0.001 * int(1000 * fraction[org.id])
            pubfrac.save()

        # Without affiliations no PubFrac is left to absorb the remainder.
        if affiliations:
            self.ensure_pubfracs_sum_to_1()

    def ensure_pubfracs_sum_to_1(self):
        """Ensure sum is 1 by putting any difference in the largest PubFrac.

        Does nothing when the publication has no PubFracs, since there is no
        PubFrac to adjust in that case.
        """
        pubfrac_max = self.pubfracs.order_by("-fraction").first()
        if pubfrac_max is None:
            return
        sum_pubfracs = self.pubfracs.aggregate(Sum("fraction"))["fraction__sum"]
        pubfrac_max.fraction += 1 - sum_pubfracs
        pubfrac_max.save()

    @property
    def pubfracs_sum_to_1(self):
        """Tell whether the fractions of this publication's PubFracs add up to exactly 1."""
        totals = self.pubfracs.aggregate(Sum("fraction"))
        return totals["fraction__sum"] == 1

    @property
    def proofs_repository(self) -> "ProofsRepository":
        """Return the proofs repository of the accepted submission's production stream."""
        production_stream = self.accepted_submission.production_stream
        return production_stream.proofs_repository

    @property
    def compensated_expenditures(self):
        """Compensated part of expenditures for this Publication.

        Returns the summed fraction of all PubFracs that have ``compensated_by``
        set, multiplied by ``expenditures``; 0 when there is no such PubFrac.
        """
        compensated = self.pubfracs.filter(compensated_by__isnull=False)
        # Use the fraction to obtain an accurate result.
        # A single aggregate suffices: Sum yields None over an empty queryset.
        fraction_sum = compensated.aggregate(Sum("fraction"))["fraction__sum"]
        if fraction_sum is None:
            return 0
        return fraction_sum * self.expenditures

    @property
    def uncompensated_expenditures(self):
        """Uncompensated part of expenditures for this Publication."""
        total = self.expenditures
        return total - self.compensated_expenditures

    @property
    def citation(self):
        """Return Publication name in the preferred citation format.

        The stored ``cf_citation`` is returned when set. Otherwise the citation
        is built from the journal abbreviation, volume or issue number (when
        available), paper number and publication year, then cached in
        ``cf_citation``.

        The cached value is persisted with a targeted queryset update rather
        than a full ``save()``: this getter is reachable from ``__str__``, so it
        must neither insert an unsaved instance nor rewrite unrelated fields.
        """
        if self.cf_citation:
            return self.cf_citation

        paper_nr = self.get_paper_nr()
        year = self.publication_date.strftime("%Y")
        issue = self.in_issue

        if issue and issue.in_volume:
            volume = issue.in_volume
            citation = (
                f"{volume.in_journal.name_abbrev} {volume.number}, "
                f"{paper_nr} ({year})"
            )
        elif issue and issue.in_journal:
            citation = (
                f"{issue.in_journal.name_abbrev} {issue.number}, "
                f"{paper_nr} ({year})"
            )
        elif self.in_journal:
            citation = f"{self.in_journal.name_abbrev} {paper_nr} ({year})"
        else:
            citation = f"{paper_nr} ({year})"

        self.cf_citation = citation
        # Only touch the database for rows that already exist.
        if self.pk is not None:
            Publication.objects.filter(pk=self.pk).update(cf_citation=citation)
        return citation

    def get_journal(self):
        """Return the journal of this publication.

        Preference order: the directly assigned Journal, then the Issue's
        Journal, then the Journal of the Issue's Volume.
        """
        journal = self.in_journal
        if not journal:
            issue = self.in_issue
            journal = issue.in_journal or issue.in_volume.in_journal
        return journal

    def journal_issn(self):
        """Return the ``issn`` of the journal given by ``get_journal()``."""
        journal = self.get_journal()
        return journal.issn

    def get_volume(self):
        """Return the Volume of this publication's Issue, or None if there is none."""
        issue = self.in_issue
        if not issue or not issue.in_volume:
            return None
        return issue.in_volume

    def get_paper_nr(self):
        """Returns the paper number (including possible suffixes) as a string.

        With a directly assigned Journal the plain number is used; otherwise it
        is formatted by ``paper_nr_string``. A non-empty ``paper_nr_suffix`` is
        appended after a hyphen.
        """
        if self.in_journal:
            number = f"{self.paper_nr}"
        else:
            number = paper_nr_string(self.paper_nr)
        suffix = f"-{self.paper_nr_suffix}" if self.paper_nr_suffix else ""
        return f"{number}{suffix}"

    def citation_rate(self):
        """Returns the citation rate in units of nr citations per article per year.

        Returns 0 when there is no cited-by data or update timestamp, and also
        when the latest cited-by update does not fall after the publication
        date (zero or negative elapsed days), which would otherwise divide by
        zero or give a negative rate.
        """
        if not (self.citedby and self.latest_citedby_update):
            return 0

        deltat = (self.latest_citedby_update.date() - self.publication_date).days
        if deltat <= 0:
            return 0

        ncites = len(self.citedby)
        return ncites * 365.25 / deltat

    def get_issue_related_publications(self):
        """Return up to 4 other published Publications from the same Issue."""
        published = Publication.objects.published()
        same_issue = published.filter(in_issue=self.in_issue)
        others = same_issue.exclude(id=self.id)
        return others[:4]

    @property
    def bundle(self):
        """Returns a QuerySet of all Publications with same DOI anchor.

        The anchor is the part of ``doi_label`` before the first hyphen. Matches
        are labels equal to the anchor or starting with the anchor plus a
        hyphen, ordered by ``doi_label``.
        """
        doi_anchor, _, _ = self.doi_label.partition("-")
        is_anchor = Q(doi_label=doi_anchor)
        extends_anchor = Q(doi_label__startswith=f"{doi_anchor}-")
        matches = Publication.objects.filter(is_anchor | extends_anchor)
        return matches.order_by("doi_label")

    @cached_property
    def tex_contents(self) -> str | None:
        """Return ``author_info_source`` when set, else the TeX from the proofs repository."""
        stored_source = self.author_info_source
        if stored_source:
            return stored_source
        return self.proofs_repository.fetch_tex()

    @cached_property
    def tex_affiliations(self) -> list[tuple[str, str]]:
        r"""
        Collect the affiliations listed in the TeX contents as (identifier, text) pairs.

        Only the part between the "TODO: AFFILIATIONS" and "END TODO: AFFILIATIONS"
        comment markers is examined; without those markers the result is empty.
        Entries follow the pattern `{\bf #} Affiliation Text ...\\`.
        An entry without a bold identifier is labelled "UN_<position>", where the
        position is its zero-based index among the matches.
        """
        source = self.tex_contents or ""
        block = re.search(
            r"%+ TODO: AFFILIATIONS.*?%+ END TODO: AFFILIATIONS",
            source,
            re.DOTALL,
        )
        if block is None:
            return []

        raw_entries = re.findall(
            r"\n(?:\{\\bf\s([^%]+?)\})?\s?([^%]+?)\n(?:\\\\|%)",
            block.group(),
            re.DOTALL,
        )

        affiliations = []
        for position, (identifier, text) in enumerate(raw_entries):
            label = identifier or f"UN_{position}"
            # TeX line breaks inside the text become plain spaces.
            cleaned_text = text.replace("\\\\", " ").strip()
            affiliations.append((label, cleaned_text))
        return affiliations

    @cached_property
    def tex_author_info(self) -> list[tuple[str, list[str]]]:
        """
        Returns a list of tuples with the author name and a list of superscripts.

        Authors are read line by line from the part of the TeX contents between
        the "TODO: AUTHORS" and "END TODO: AUTHORS" comment markers. When the
        markers are absent, the comma-separated ``author_list`` is used instead,
        with no superscripts.

        Blank lines in the block are skipped, and empty superscript entries are
        dropped, so an author without superscripts gets an empty list (matching
        the fallback branch) rather than a list holding an empty string.
        """
        authors_block = re.search(
            r"%+ TODO: AUTHORS(.*?)%+ END TODO: AUTHORS",
            self.tex_contents or "",
            re.DOTALL,
        )
        if authors_block is None:
            # If matching against the tex file fails, we use the author_list of the submission.
            return [(author.strip(), []) for author in self.author_list.split(",")]

        def extract_author_info(line: str) -> tuple[str, list[str]]:
            """Extract the author name and its superscripts from one line."""
            author_match = re.match(r"^(?:\\orcidlink\{.*?\}\\?\W?)?([^\\]+)", line)
            superscripts_match = re.match(r".*?\\textsuperscript\{(.*?)\}", line)

            author_text = author_match.group(1) if author_match else ""
            superscripts_text = (
                superscripts_match.group(1) if superscripts_match else ""
            )

            # Remove any trailing or leading "and"s.
            author_text = re.sub(r"(?:^and(\W)|(\W)and$)", r"\1", author_text).strip()

            # Prepend a comma to every $...$ expression to split them later
            superscripts_text = re.sub(r"(\$.+?\$)", r",\1", superscripts_text)

            # Splitting "" (or text with a leading comma) yields empty strings;
            # those are not superscripts and are left out.
            superscripts = [
                superscript.strip() for superscript in superscripts_text.split(",")
            ]
            return author_text, [s for s in superscripts if s]

        author_lines = authors_block.group(1).strip().split("\n")
        return [extract_author_info(line) for line in author_lines if line.strip()]

    def reset_author_associations(self) -> None:
        """Delete all PublicationAuthorsTable entries and repopulate them from ``tex_author_info``.

        Each author name is looked up with ``Profile.objects.search``; the first
        match (if any) is attached, otherwise the entry is left without profile.
        The cached affiliation indices are cleared and, when names were found,
        ``author_list`` is rebuilt from them.

        The names are resolved before anything is deleted, and the delete,
        re-create and save steps run in one transaction, so a failure cannot
        leave the publication stripped of its authors.
        """
        from django.db import transaction
        from profiles.models import Profile

        # Resolve names up front: reading tex_author_info may raise, and that
        # must happen before the existing author rows are removed.
        author_names = [name for name, _ in self.tex_author_info]

        author_entries: list[PublicationAuthorsTable] = []
        for order, name in enumerate(author_names, start=1):
            profile_matches = list(Profile.objects.search(name))
            profile = profile_matches[0] if profile_matches else None
            author_entries.append(
                PublicationAuthorsTable(
                    publication=self,
                    profile=profile,
                    order=order,
                )
            )

        with transaction.atomic():
            PublicationAuthorsTable.objects.filter(publication=self).delete()
            PublicationAuthorsTable.objects.bulk_create(author_entries)

            # Force recomputation of the affiliation indices on next access.
            self.cf_author_affiliation_indices_list = []
            if author_names:
                self.author_list = ", ".join(author_names)

            self.save()


class Reference(models.Model):
    """A Reference is a reference used in a specific Publication."""

    reference_number = models.IntegerField()
    publication = models.ForeignKey("journals.Publication", on_delete=models.CASCADE)

    authors = models.CharField(max_length=1028)
    citation = models.CharField(max_length=1028, blank=True)
    identifier = models.CharField(blank=True, max_length=128)
    link = models.URLField(blank=True)

    class Meta:
        unique_together = ("reference_number", "publication")
        ordering = ["reference_number"]
        default_related_name = "references"

    def __str__(self):
        """Return "[number] authors, citation", with both texts cut to 30 chars."""
        short_authors = self.authors[:30]
        short_citation = self.citation[:30]
        return f"[{self.reference_number}] {short_authors}, {short_citation}"