# Source code for ads.utils

""" General utilities. """

import os
import json
import re
from collections import OrderedDict

from collections.abc import Iterable

# Pattern for a 19-character bibcode: 4-character year, 5-character journal
# abbreviation, 4-character volume, 1-character qualifier, 4-character page
# number and the first letter of the first author's last name.
# Written as raw strings so that the backslashes reach the regex engine
# untouched ("\&" and "\." are not valid Python string escapes).
_bibcode_regex_pattern = (
    r"(?P<year>[0-9]{4})"
    r"(?P<journal_abbreviation>[A-Za-z0-9\&\.]{5})"
    r"(?P<volume>[0-9\.]{4})"
    r"(?P<qualifier>[A-Z0-9\.])"
    r"(?P<page_number>[0-9\.]{4})"
    r"(?P<first_letter_of_last_name>[A-Z])"
)


def parse_bibcode(bibcode: str) -> dict:
    """
    Parse a bibcode and return a dictionary with the parsed data.

    See https://ui.adsabs.harvard.edu/help/actions/bibcode

    :param bibcode:
        The bibcode string. The whole string must be a bibcode; text before
        or after an otherwise valid bibcode is rejected.

    :returns:
        A dictionary with the keys ``year``, ``journal_abbreviation``,
        ``volume``, ``qualifier``, ``page_number`` and
        ``first_letter_of_last_name``. All values are strings with the
        leading and trailing ``.`` padding removed. When the qualifier is a
        digit it is treated as the leading digit of the page number and the
        returned ``qualifier`` is an empty string.

    :raises ValueError:
        If ``bibcode`` is not a valid bibcode.
    """
    # fullmatch (rather than match) so that trailing junk after a valid
    # 19-character prefix is not silently accepted.
    match = re.fullmatch(_bibcode_regex_pattern, bibcode)
    if match is None:
        raise ValueError(f"Invalid bibcode '{bibcode}'")

    # Fields shorter than their fixed width are padded with dots.
    items = {key: value.strip(".") for key, value in match.groupdict().items()}

    # If the qualifier is an integer, then it's the page number: the page
    # number overflowed its four columns into the qualifier column.
    if items["qualifier"].isdigit():
        items["page_number"] = items["qualifier"] + items["page_number"]
        items["qualifier"] = ""

    return items



def flatten(struct: Iterable) -> list:
    """
    Create a flat list of all items in the structure.

    Nesting is removed recursively: dictionaries contribute their values
    (keys are discarded), strings are kept whole rather than split into
    characters, any other iterable contributes its items, and a
    non-iterable object is returned as a single-item list. ``None``
    contributes nothing, both at the top level and when nested.

    :param struct:
        The (possibly nested) structure to flatten.

    :returns:
        A flat list of the leaf items, in iteration order.
    """
    if struct is None:
        return []

    if isinstance(struct, dict):
        children = struct.values()
    elif isinstance(struct, str):
        # A string is a leaf, even though it is iterable.
        return [struct]
    else:
        try:
            children = iter(struct)
        except TypeError:
            # Not iterable, so it is a leaf.
            return [struct]

    leaves = []
    for child in children:
        leaves.extend(flatten(child))
    return leaves


def to_bibcode(iterable):
    """
    Return a bibcode for each item in the iterable.

    The iterable could contain :class:`ads.Document` objects, bibcode strings, etc.

    :param iterable:
        A bibcode string, an object with a ``bibcode`` attribute, or an
        iterable (possibly nested) of these.

    :returns:
        The string itself when given a valid bibcode string; a list of
        converted items (with nesting preserved) when given an iterable;
        otherwise the ``bibcode`` attribute of the given object.

    :raises ValueError:
        If a string is given that is not a valid bibcode.

    :raises TypeError:
        If an item is not a string, not iterable, and has no ``bibcode``
        attribute.
    """
    if isinstance(iterable, str):
        # fullmatch (rather than match) so that trailing junk after a valid
        # bibcode prefix is rejected instead of being passed through.
        if not re.fullmatch(_bibcode_regex_pattern, iterable):
            raise ValueError(f"Invalid bibcode '{iterable}'")
        return iterable

    if isinstance(iterable, Iterable):
        return [to_bibcode(item) for item in iterable]

    try:
        return iterable.bibcode
    except AttributeError:
        raise TypeError("Expected a bibcode string, an ads.Document, or an iterable of these.")


def _get_data_path(basename=""):
    """
    Return the resolved path to ``basename`` in the ``data`` directory.

    The ``data`` directory is located one level above the first entry of
    the ``ads`` package path. With the default empty ``basename`` the
    path of the ``data`` directory itself is returned.
    """
    import ads

    package_dir = ads.__path__[0]
    unresolved = os.path.join(package_dir, "../data", basename)
    return os.path.realpath(unresolved)

def _load_affiliation_countries(path):
    """
    Read the affiliation-country table at ``path`` and index it by ID pair.

    :param path:
        Path to a tab-separated file with the columns country, parent ID,
        child ID, abbreviation and canonical affiliation. The first line
        is skipped (presumably a header row).

    :returns:
        An ordered mapping of ``"{parent_id}|{child_id}"`` to the country,
        or to ``None`` when the country column is empty. Empty IDs are
        stored as ``"0"``.
    """
    countries = OrderedDict()
    with open(path, "r") as fp:
        next(fp, None)
        for line in fp:
            row = line.rstrip("\n")
            if not row:
                # Tolerate blank lines (e.g. at the end of the file).
                continue

            # Do not strip leading whitespace: an empty country column
            # shows up as a leading tab.
            country, parent_id, child_id, _abbrev, _canonical = row.split("\t")

            # affiliations.tsv uses "0" to indicate no ID, but affiliations_country.tsv uses ""
            key = f"{parent_id or '0'}|{child_id or '0'}"
            countries[key] = country or None
    return countries


def _resolve_country(countries, parent_id, child_id):
    """
    Look up the country for an affiliation in the ``countries`` index.

    The first key found wins, in this order:

    1. Match by parent_id and child_id.
    2. Match by child_id (entry without a parent ID).
    3. Match by parent_id (entry without a child ID).

    The index stores a missing ID as ``"0"``, never as an empty string, so
    the fallback keys must use ``"0"`` as well in order to ever match.

    :returns:
        The country of the first matching entry (which may be ``None``),
        or ``None`` when no entry matches.
    """
    for key in (f"{parent_id}|{child_id}", f"0|{child_id}", f"{parent_id}|0"):
        if key in countries:
            return countries[key]
    return None


def setup_database():
    """
    Set up the local database for Journals and Affiliations.

    Any existing database file is removed, the ``Affiliation`` and
    ``Journal`` tables are dropped and re-created, and the tables are then
    filled from ``affiliations.tsv``, ``affiliations_country.tsv`` and
    ``journals.json`` in the package data directory. Progress is reported
    with ``print``.
    """

    from ads.models.local import (database, database_path)
    from ads.models.affiliation import Affiliation
    from ads.models.journal import Journal

    ads_dir = os.path.dirname(database_path)
    data_dir = _get_data_path()

    print(f"Using ADS directory {ads_dir}")
    print(f"Looking for data files in {data_dir}")

    # Create the directory if it doesn't exist.
    os.makedirs(ads_dir, exist_ok=True)

    print(f"Remove existing database at {database_path}..")
    try:
        os.remove(database_path)
    except OSError:
        # Best effort: the file may not exist or may not be removable.
        # The tables are dropped and re-created below in any case.
        pass

    # Close old db
    database.close()

    # Create the databases.
    print("Create database")
    with database.atomic():
        database.connect(reuse_if_open=True)
        print("Dropping tables")
        database.drop_tables([Affiliation, Journal])
        print("Create tables..")
        database.create_tables([Affiliation, Journal])

    with database.atomic():
        database.connect(reuse_if_open=True)

        _journals_path = _get_data_path("journals.json")
        _affiliation_path = _get_data_path("affiliations.tsv")
        _affiliation_country_path = _get_data_path("affiliations_country.tsv")

        print(f"Load countries from {_affiliation_country_path}..")
        countries_dict = _load_affiliation_countries(_affiliation_country_path)

        print(f"Loaded countries for {len(countries_dict)} affiliations.")
        print(f"Ingest affiliations from {_affiliation_path}..")

        # Counted explicitly so the summary also works for a file with no rows.
        n_affiliations = 0
        with open(_affiliation_path, "r") as fp:
            next(fp, None)  # Skip the first line (presumably a header row).
            for line in fp:
                row = line.strip()
                if not row:
                    # Tolerate blank lines (e.g. at the end of the file).
                    continue

                parent_id, child_id, abbreviation, canonical_name = row.split("\t")

                # A parent ID of "0" means the affiliation has no parent.
                parent = None if parent_id == "0" else Affiliation(id=parent_id)

                Affiliation.create(
                    id=child_id,
                    abbreviation=abbreviation,
                    canonical_name=canonical_name,
                    country=_resolve_country(countries_dict, parent_id, child_id),
                    parent=parent
                )
                n_affiliations += 1

        print(f"Ingested {n_affiliations} affiliations")

        # Load in the journals.
        print(f"Ingest journals from {_journals_path}..")
        with open(_journals_path, "r") as fp:
            journals = json.load(fp)

        for abbreviation, title in journals.items():
            Journal.create(abbreviation=abbreviation, title=title)

        print(f"Ingested {len(journals)} journals")
        print("Done!")