feat: consolidate remote ids and wikisource identifiers #10092
base: master
@@ -2,6 +2,7 @@
import web
from openlibrary.catalog.utils import flip_name, author_dates_match, key_int
from openlibrary.core.helpers import extract_year
from openlibrary.utils import uniq

if TYPE_CHECKING:
    from openlibrary.plugins.upstream.models import Author

@@ -142,10 +143,59 @@ def walk_redirects(obj, seen):
    seen.add(obj['key'])
    return obj


# Try for an 'exact' (case-insensitive) name match, but fall back to alternate_names,
# then last name with identical birth and death dates (that are not themselves `None` or '').
def get_redirected_authors(authors: list["Author"]):
    if any(a.type.key != '/type/author' for a in authors):
        seen: set[str] = set()
        all_authors = [
            walk_redirects(a, seen) for a in authors if a['key'] not in seen
        ]
        return all_authors
    return authors


    # Look for OL ID first.
    if key := author.get("key"):
        if reply := list(web.ctx.site.things({"type": "/type/author", "key~": key})):
            # Always match on OL ID, even if remote identifiers don't match.
            return get_redirected_authors([web.ctx.site.get(k) for k in reply])
    # Try other identifiers next.
    if identifiers := author.get("identifiers"):
        queries = []
        matched_authors = []
        # Get all the authors that match any incoming identifier.
        for identifier, val in identifiers.items():
            queries.append({"type": "/type/author", f"remote_ids.{identifier}~": val})
        for query in queries:
            if reply := list(web.ctx.site.things(query)):
                matched_authors.extend(
                    get_redirected_authors([web.ctx.site.get(k) for k in reply])
                )
        matched_authors = uniq(matched_authors)
        # The match is whichever author has the most identifiers in common AND
        # does not have more conflicts than matched identifiers.
        highest_matches = 0
        selected_match = None
        for a in matched_authors:
            try:
                _, matches = a.merge_remote_ids(identifiers)
                if matches > highest_matches:
                    selected_match = a
                    highest_matches = matches
                elif matches == highest_matches and matches > 0:
                    # Prioritize the lower OL ID when matched identifiers are equal.
                    selected_match = (
                        a
                        if a.get_key_numeric() < selected_match.get_key_numeric()
                        else selected_match
                    )
            except Exception:
                # Reject if too many conflicts.
                # TODO: raise a flag to librarians here?
                pass
        if highest_matches > 0 and selected_match is not None:
            return [selected_match]
    # Fall back to name/date matching, which we did before introducing identifiers.
    name = author["name"].replace("*", r"\*")
    queries = [
        {"type": "/type/author", "name~": name},

Review comment: note to self: remove this

        {"type": "/type/author", "alternate_names~": name},
        {

@@ -155,37 +205,17 @@ def walk_redirects(obj, seen):
            "death_date~": f"*{extract_year(author.get('death_date', '')) or -1}*",
        },  # Use `-1` to ensure an empty string from extract_year doesn't match empty dates.
    ]
    things = []
    for query in queries:
        if reply := list(web.ctx.site.things(query)):
            break

    authors = [web.ctx.site.get(k) for k in reply]
    if any(a.type.key != '/type/author' for a in authors):
        seen: set[str] = set()
        authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen]
    return authors
def find_entity(author: dict[str, Any]) -> "Author | None":
    """
    Looks for an existing Author record in OL
    and returns it if found.

    :param dict author: Author import dict {"name": "Some One"}
    :return: Existing Author record if found, or None.
    """
    assert isinstance(author, dict)
    things = find_author(author)
    if author.get('entity_type', 'person') != 'person':
        return things[0] if things else None
    things = get_redirected_authors([web.ctx.site.get(k) for k in reply])

Review comment: note to self: break here

    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        orig_key = key
        assert a.type.key == '/type/author'
        if 'birth_date' in author and 'birth_date' not in a:
            continue

@@ -195,10 +225,27 @@ def find_entity(author: dict[str, Any]) -> "Author | None":
            continue
        match.append(a)
    if not match:
        return None
        return []
    if len(match) == 1:
        return match[0]
        return [match[0]]
    return pick_from_matches(author, match)
    return [pick_from_matches(author, match)]
def find_entity(author: dict[str, Any]) -> "Author | None":
    """
    Looks for an existing Author record in OL
    and returns it if found.

    :param dict author: Author import dict {"name": "Some One"}
    :return: Existing Author record if found, or None.
    """
    assert isinstance(author, dict)
    things = find_author(author)
    if "identifiers" in author:
        for index, t in enumerate(things):
            t.remote_ids, _ = t.merge_remote_ids(author["identifiers"])
            things[index] = t
    return things[0] if things else None


def remove_author_honorifics(name: str) -> str:
@@ -246,9 +293,20 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str,
        new['death_date'] = author['death_date']
        return new
    a = {'type': {'key': '/type/author'}}
    for f in 'name', 'title', 'personal_name', 'birth_date', 'death_date', 'date':
    for f in (
        'name',
        'title',
        'personal_name',
        'birth_date',
        'death_date',
        'date',
        'remote_ids',
    ):
        if f in author:
            a[f] = author[f]
    # Import records hitting the endpoint should list external IDs under "identifiers",
    # but they need to be "remote_ids" when going into the DB.
    if "identifiers" in author:
        a["remote_ids"] = author["identifiers"]
    return a
@@ -136,9 +136,77 @@ def test_author_wildcard_match_with_no_matches_creates_author_with_wildcard(
        new_author_name = import_author(author)
        assert author["name"] == new_author_name["name"]

    def test_first_match_priority_name_and_dates(self, mock_site):
    def test_first_match_ol_key(self, mock_site):
        """
        Highest priority match is name, birth date, and death date.
        Highest priority match is OL key.
        """
        self.add_three_existing_authors(mock_site)

        # Author with VIAF
        author = {
            "name": "William H. Brewer",
            "key": "/authors/OL3A",
            "type": {"key": "/type/author"},
            "remote_ids": {"viaf": "12345678"},
        }

        # Another author with VIAF
        author_different_key = {
            "name": "William Brewer",
            "key": "/authors/OL4A",
            "type": {"key": "/type/author"},
            "remote_ids": {"viaf": "87654321"},
        }

        mock_site.save(author)
        mock_site.save(author_different_key)

        # Look for an exact match on OL ID, regardless of other fields.
        # We ideally shouldn't ever have a case where different authors have the
        # same VIAF, but this demonstrates priority.
        searched_author = {
            "name": "William H. Brewer",
            "key": "/authors/OL4A",
            "identifiers": {"viaf": "12345678"},
        }
        found = import_author(searched_author)
        assert found.key == author_different_key["key"]

    def test_second_match_strong_identifier(self, mock_site):
        """
        Next highest priority match is any other strong identifier, such as VIAF, Goodreads ID, Amazon ID, etc.
        """

Review comment (on lines +174 to +176): Tom Morris says that Goodreads and Amazon IDs aren't "strong" identifiers 🙃 Maybe just use "identifier"? Or "remote identifier"? (Paralleling the

        self.add_three_existing_authors(mock_site)

        # Author with VIAF
        author = {
            "name": "William H. Brewer",
            "key": "/authors/OL3A",
            "type": {"key": "/type/author"},
            "remote_ids": {"viaf": "12345678"},
        }

        # Another author with a different VIAF
        author_different_viaf = {
            "name": "William Brewer",
            "key": "/authors/OL4A",
            "type": {"key": "/type/author"},
            "remote_ids": {"viaf": "87654321"},
        }

        mock_site.save(author)
        mock_site.save(author_different_viaf)

        # Look for an exact match on VIAF, regardless of the name field.
        searched_author = {
            "name": "William Brewer",
            "identifiers": {"viaf": "12345678"},
        }
        found = import_author(searched_author)
        assert found.key == author["key"]

    def test_third_match_priority_name_and_dates(self, mock_site):
        """
        Next highest priority match is name, birth date, and death date.
        """
        self.add_three_existing_authors(mock_site)


@@ -201,7 +269,7 @@ def test_non_matching_birth_death_creates_new_author(self, mock_site):
        assert isinstance(found, dict)
        assert found["death_date"] == searched_and_not_found_author["death_date"]

    def test_second_match_priority_alternate_names_and_dates(self, mock_site):
    def test_match_priority_alternate_names_and_dates(self, mock_site):
        """
        Matching, as a unit, alternate name, birth date, and death date, get
        second match priority.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,8 @@ const identifierPatterns = {
    lc_naf: /^n[bors]?[0-9]+$/,
    amazon: /^B[0-9A-Za-z]{9}$/,
    youtube: /^@[A-Za-z0-9_\-.]{3,30}/,
    imdb: /^\w{2}\d+$/,
    opac_sbn: /^\D{2}[A-Z0-3]V\d{6}$/,

Review comment (on lines +38 to +39): Why is this part of this PR?

}
export default {
    // Props are for external options; if a subelement of this is modified,
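The two new patterns can be sanity-checked outside the component. They are transcribed to Python below (the regex syntax is identical for these cases); the sample IDs are illustrative guesses, not values from the PR.

```python
import re

# Patterns transcribed from the Vue component's identifierPatterns map.
imdb = re.compile(r"^\w{2}\d+$")
opac_sbn = re.compile(r"^\D{2}[A-Z0-3]V\d{6}$")

print(bool(imdb.match("nm0000001")))       # True: two word chars, then digits
print(bool(opac_sbn.match("CFIV000652")))  # True: two non-digits, one of [A-Z0-3], "V", six digits
print(bool(imdb.match("nm")))              # False: no trailing digits
```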
@@ -8,6 +8,7 @@
import web
import json
import requests
import re
from typing import Any, TypedDict
from collections import defaultdict
from dataclasses import dataclass, field

@@ -29,7 +30,7 @@
from openlibrary.core.ratings import Ratings
from openlibrary.utils import extract_numeric_id_from_olid
from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical
from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity, REMOTE_IDS

from . import cache, waitinglist


@@ -217,6 +218,10 @@ def _get_d(self):
            "l": self._get_lists_cached(),
        }

    def get_key_numeric(self):
        """Returns just the numeric part of the key."""
        return int(re.search(r'\d+', self.key).group())

Review comment: Given that we know our own identifiers, we also know that they always start with

But also, what does

Reply: It's used here to choose the lower of two OL IDs when identifiers are otherwise the same: https://github.com/internetarchive/openlibrary/pull/10092/files#diff-8a66754753640315d80bf708c3483a13439e24736f081161b22e2d59cee76314R183

I haven't tried yet if this will work with just a string, which was going to be part of the unit tests I'll add when I mark this PR as ready for review.

Reply: But why this method instead of

Reply: Probably nothing, I've just never seen that function before.
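One detail worth pinning down in the unit tests the thread above mentions: `re.search` returns a `Match` object rather than a string, so `.group()` is required before `int()` (or the existing `extract_numeric_id_from_olid` helper the reviewer points to could be used instead). A minimal check with a hypothetical key:

```python
import re

key = "/authors/OL12345A"       # hypothetical OL author key
match = re.search(r"\d+", key)  # a Match object, not a string
print(int(match.group()))       # 12345
```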
class ThingReferenceDict(TypedDict):
    key: ThingKey

@@ -806,6 +811,25 @@ def get_edition_count(self):
    def get_lists(self, limit=50, offset=0, sort=True):
        return self._get_lists(limit=limit, offset=offset, sort=sort)

    def merge_remote_ids(
        self, incoming_ids: dict[str, str]
    ) -> tuple[dict[str, str], int]:
        output = {**self.remote_ids}
        if not incoming_ids:
            return output, 0
        matches = 0
        conflicts = 0
        for identifier in REMOTE_IDS:
            if identifier in output and identifier in incoming_ids:
                if output[identifier] != incoming_ids[identifier]:
                    conflicts += 1
                else:
                    output[identifier] = incoming_ids[identifier]
                    matches += 1
        if conflicts > matches:
            raise Exception("wikidata json conflicts with existing remote ids")
        return output, matches


class User(Thing):
    DEFAULT_PREFERENCES = {
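The merge semantics above can be sketched as a standalone function for experimentation. This is a simplified rendering, not the PR's method: `REMOTE_IDS` is an assumed subset of the list imported from `openlibrary.core.wikidata`, the existing ids are passed in as a parameter instead of read from `self.remote_ids`, and `ValueError` stands in for the generic `Exception`.

```python
REMOTE_IDS = ["viaf", "isni", "wikidata"]  # assumed subset of the real list

def merge_remote_ids(
    existing: dict[str, str], incoming: dict[str, str]
) -> tuple[dict[str, str], int]:
    """Count agreements and disagreements on identifiers present on both
    sides; reject the merge when conflicts outnumber matches."""
    output = dict(existing)
    if not incoming:
        return output, 0
    matches = conflicts = 0
    for identifier in REMOTE_IDS:
        if identifier in output and identifier in incoming:
            if output[identifier] != incoming[identifier]:
                conflicts += 1
            else:
                matches += 1
    if conflicts > matches:
        raise ValueError("incoming ids conflict with existing remote ids")
    return output, matches

merged, n = merge_remote_ids(
    {"viaf": "123", "isni": "0000"}, {"viaf": "123", "wikidata": "Q1"}
)
print(n)  # 1: only the shared, agreeing "viaf" counts as a match
```

Note that a single disagreeing shared identifier with no agreeing ones is enough to reject the merge.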
Review comment: This seems like a new feature to use author identifiers for author matching that is being mixed in with other work rather than being put forward as a clear stand-alone prerequisite.

#10029 feels like a planning step, without actually making the feature request, because it is presented as a question. #9927 covers the first step, but it focuses on importing identifiers in the first place, which we mostly do.

The desired feature appears to be "Use author identifiers on import to determine existing author record matches (not just name and date, which is the current method)."

This would involve extending the import schema to support populating the Author.remote_ids values, which does seem to be missing at the moment, although the UI allows them to be edited after import.

I'm flagging this line in particular because I don't think the author OLID value should be called `key` in the import schema. At this point it is just another identifier sourced from somewhere other than Open Library. It should be something like identifiers: {openlibrary: OL1234A}.

Being clear about which identifiers are suitable for this purpose up front would be good too. I agree that Amazon ids aren't very 'strong', for example. VIAF, ISNI, and Wikidata are the core ones I believe have decent coverage in OL and are likely to be useful right now.

Obviously, when a general mechanism is in place, the list can be extended as new use cases are brought forward.

There was quite a bit of reading between the lines to figure out what the core new feature is here. Changing the import schema significantly in code without a clear driving feature or design had me worried.

Reply: I can move the importing piece out into a separate PR. That code is here because I'd already done a proof of concept for it, but I was wary about trying to push it through when the remote_ids we'd be matching to aren't filled out as much as they could be. So backfilling that info for existing authors and proactively filling it out for future WD fetches going forward would ideally be a prerequisite to incorporating that import change.

I should probably hold off on further work on this at least until an agreement is reached about which identifiers should be used. @Freso mentioned in the issue thread that "library identifiers are, in my experience, often conflated and/or lacking a lot of entries, like OCLC/VIAF/ISNI are ripe with both duplicates and conflated entities and also don't have information on a lot of items (either reliable/useful information, or just straight no information at all). In my experience, identifiers that are community maintained/curated (like MBIDs, BBIDs, WD ids) are far more reliable, but all datasets—community or institution managed—has its holes/gaps."

Reply: @hornc please see also my mid-October comments on the related PR #9674

Reply: #9411 #9448

Reply: Thank you for those issue links @Freso, individual PRs that address #9411 and #9448 would be much easier to review and understand. I don't think either is really a prerequisite to the Wikisource import feature, but #9411 and #9448 are clear and self-contained, and seem reasonable to do.

Reply: #7724 from a year and a half ago is also relevant. Since MARC records have the highest volume of identifiers, I'd suggest starting with that.
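As a concrete illustration of the schema suggestion in the thread above (the field names here follow the review comment, not any agreed specification), an import record would carry the OLID under "identifiers" alongside other remote ids, rather than as a top-level key:

```python
# Hypothetical import payload shape per the review suggestion above.
suggested_author = {
    "name": "William H. Brewer",
    "identifiers": {
        "openlibrary": "OL1234A",  # OLID as just another identifier
        "viaf": "12345678",
    },
}
print("key" in suggested_author)  # False: no top-level OL key field
```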