Names

`rigour.names`

Name handling utilities for person and organisation names. This module contains a large (and growing) set of tools for handling names. In general, there are three types of names: people, organizations, and objects. Different normalization may be required for each of these types, including prefix removal for person names (e.g. "Mr." or "Ms.") and type normalization for organization names (e.g. "Incorporated" -> "Inc" or "Limited" -> "Ltd").

The Name class is meant to provide a structure for a name, including its original form, normalized form, metadata on the type of thing described by the name, and the language of the name. The NamePart class is used to represent individual parts of a name, such as the first name, middle name, and last name.

Falsehoods Programmers Believe About Names

`Name`

Bases: object

A name of a thing, such as a person, organization or object. Each name consists of a sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name and "Smith" is a family name. The tag for "John" would be NamePartTag.GIVEN and the tag for "Smith" would be NamePartTag.FAMILY. The form for both parts would be the text of the part itself.

Source code in rigour/names/name.py

class Name(object):
    """A name of a thing, such as a person, organization or object. Each name consists of a
    sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag
    is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name
    and "Smith" is a family name. The tag for "John" would be `NamePartTag.GIVEN` and the tag for "Smith"
    would be `NamePartTag.FAMILY`. The form for both parts would be the text of the part itself.

    Equality and hashing are based on `form` only, so two names with different `original`
    text, tag or language compare equal when their forms are identical.
    """

    __slots__ = ["original", "form", "tag", "lang", "_parts"]

    def __init__(
        self,
        original: str,
        form: Optional[str] = None,
        tag: NameTypeTag = NameTypeTag.UNK,
        lang: Optional[str] = None,
        parts: Optional[List[NamePart]] = None,
    ):
        """Create a name.

        Args:
            original (str): The name text as supplied; returned by `str(name)`.
            form (Optional[str]): The processing form of the name. When missing or empty,
                it is derived from `original` via `to_form`.
            tag (NameTypeTag): The type of thing the name describes.
            lang (Optional[str]): The language of the name, if known.
            parts (Optional[List[NamePart]]): Pre-computed name parts. When omitted, the
                parts are produced lazily by tokenizing `form`.
        """
        self.original = original
        self.form = form or to_form(original)
        self.tag = tag
        self.lang = lang
        self._parts = parts

    @property
    def parts(self) -> List[NamePart]:
        """The parts of the name, built on first access by tokenizing `form` and cached."""
        if self._parts is None:
            self._parts = [
                NamePart(form, i) for i, form in enumerate(tokenize_name(self.form))
            ]
        return self._parts

    def tag_text(self, text: str, tag: NamePartTag, max_matches: int = 1) -> None:
        """Apply `tag` to the name parts that spell out the given text.

        The text is converted with `to_form` and tokenized; the parts of the name are then
        scanned in order for the resulting token sequence. Only parts currently tagged
        with `tag` or `NamePartTag.ANY` can take part in a match; any other part discards
        the partial match collected so far.

        Args:
            text (str): The text to look for in the name.
            tag (NamePartTag): The tag to assign to the matched parts.
            max_matches (int): Stop after this many complete matches.
        """
        tokens = tokenize_name(to_form(text))
        if not tokens:
            # Nothing to look for. Without this guard the index lookup below would
            # raise IndexError for text that tokenizes to an empty sequence.
            return
        matches = 0
        matching: List[NamePart] = []
        for part in self.parts:
            if part.tag not in (tag, NamePartTag.ANY):
                matching = []
                continue
            # NOTE(review): a part whose form differs from the expected token is skipped
            # without resetting the partial match, so matches need not be contiguous.
            # Confirm this is intended.
            if part.form == tokens[len(matching)]:
                matching.append(part)
            # Only tag once the entire token sequence has been matched.
            if len(matching) == len(tokens):
                for matched in matching:
                    matched.tag = tag
                matches += 1
                if matches >= max_matches:
                    return
                matching = []

    def __eq__(self, other: Any) -> bool:
        """Compare by `form`; objects without a `form` attribute are never equal."""
        try:
            return self.form == other.form  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        """Hash by `form`, consistent with `__eq__`."""
        return hash(self.form)

    def __str__(self) -> str:
        """Return the original, unprocessed name text."""
        return self.original

    def __repr__(self) -> str:
        return "<Name(%r, %r, %r)>" % (self.original, self.form, self.tag.value)

`NamePart`

Bases: object

A part of a name, such as a given name or family name. This object is used to compare and match names. It generates and caches representations of the name in various processing forms.

Source code in rigour/names/part.py

class NamePart(object):
    """A single component of a name (for instance a given or a family name).

    Parts are the unit used when comparing and matching names. Derived representations
    of the part's text are computed on demand; `is_modern_alphabet` and `ascii` are
    cached after first use.
    """

    # NOTE(review): slots are disabled here, presumably because `cached_property`
    # stores its result in the instance `__dict__` -- confirm before re-enabling.
    #  __slots__ = ["form", "index", "tag"]

    def __init__(
        self,
        form: str,
        index: Optional[int] = None,
        tag: NamePartTag = NamePartTag.ANY,
    ) -> None:
        """Create a name part.

        Args:
            form (str): The text of the part.
            index (Optional[int]): Position of the part within its name, if known.
            tag (NamePartTag): The type of the part.
        """
        self.form = form
        self.index = index
        self.tag = tag

    @cached_property
    def is_modern_alphabet(self) -> bool:
        """Result of `is_modern_alphabet` for the part's form (cached)."""
        return is_modern_alphabet(self.form)

    @cached_property
    def ascii(self) -> Optional[str]:
        """Result of `ascii_text` for the part's form, which may be None (cached)."""
        return ascii_text(self.form)

    @property
    def maybe_ascii(self) -> str:
        """The ASCII form when the part is in a modern alphabet and an ASCII form
        exists; otherwise the unchanged form."""
        if self.is_modern_alphabet and self.ascii is not None:
            return self.ascii
        return self.form

    @property
    def metaphone(self) -> Optional[str]:
        """Metaphone code of the ASCII form, or None when the part is not in a modern
        alphabet or has no ASCII form."""
        if not self.is_modern_alphabet or self.ascii is None:
            return None
        # doesn't handle non-ascii characters
        return metaphone(self.ascii)

    def __eq__(self, other: Any) -> bool:
        """Parts are equal when both form and index agree; objects lacking these
        attributes are never equal."""
        try:
            same_form = other.form == self.form
            return same_form and other.index == self.index  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        """Hash on (index, form), consistent with `__eq__`."""
        return hash((self.index, self.form))

    def __len__(self) -> int:
        """Length of the part's form in characters."""
        return len(self.form)

    def __repr__(self) -> str:
        return f"<NamePart({self.form!r}, {self.index!s}, {self.tag.value!r})>"

`NamePartTag`

Bases: Enum

Within a name, identify name part types.

Source code in rigour/names/tag.py

class NamePartTag(Enum):
    """Within a name, identify name part types.

    Every member's value is its own name as a string. `TILTLE` is a historical
    misspelling that is kept as an alias of `TITLE`; both resolve to the same member,
    whose `name` is "TITLE".
    """

    ANY = "ANY"

    TITLE = "TITLE"
    # Backward-compatible alias for the misspelled name. It must come after TITLE so
    # that TITLE is the canonical member and this one the alias.
    TILTLE = "TITLE"
    GIVEN = "GIVEN"
    MIDDLE = "MIDDLE"
    FAMILY = "FAMILY"
    TRIBAL = "TRIBAL"
    PATRONYMIC = "PATRONYMIC"
    MATRONYMIC = "MATRONYMIC"
    HONORIFIC = "HONORIFIC"
    SUFFIX = "SUFFIX"
    NICK = "NICK"

    STOP = "STOP"  # Stopword
    LEGAL = "LEGAL"  # Legal form of an organisation

`NameTypeTag`

Bases: Enum

Metadata on what sort of object is described by a name

Source code in rigour/names/tag.py

class NameTypeTag(Enum):
    """Metadata on what sort of object is described by a name.

    Every member's value is its own name as a string.
    """

    UNK = "UNK"  # Unknown
    ENT = "ENT"  # Entity
    PER = "PER"  # Person
    ORG = "ORG"  # Organization/Company
    OBJ = "OBJ"  # Object - Vessel, Security, etc.

`extract_org_types(name, normalizer=_normalize_compare)`

Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and return the extracted type.

This can be used as a very poor man's method to determine if a given string is a company name.

Parameters:

Name	Type	Description	Default
`name`	`str`	The text to be processed. It is assumed to be already normalized (see below).	required
`normalizer`	`Callable[[str \| None], str \| None]`	A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.	`_normalize_compare`

Returns:

Type	Description
`List[Tuple[str, str]]`	A list of tuples, each holding the org type as matched and the compare form of it.

Source code in rigour/names/org_types.py

def extract_org_types(
    name: str, normalizer: Normalizer = _normalize_compare
) -> List[Tuple[str, str]]:
    """Find organization type designations (e.g. LLC, Inc, GmbH) in an entity name.

    This can be used as a very poor man's method to determine if a given string is a
    company name.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        List[Tuple[str, str]]: One tuple per match, holding the org type as matched and
            its compare form. When the replacer's mapping has no entry for a match, the
            matched text itself is used as the compare form.
    """
    replacer = _compare_replacer(normalizer=normalizer)
    return [
        (found, replacer.mapping.get(found, found))
        for found in replacer.extract(name)
    ]

`is_name(name)`

Check if the given string is a name. The string is considered a name if it contains at least one character that is a letter (category 'L' in Unicode).

Source code in rigour/names/check.py

def is_name(name: str) -> bool:
    """Tell whether a string can be considered a name.

    A string qualifies as soon as one of its characters is a letter, i.e. its Unicode
    general category starts with 'L'. The empty string is never a name.
    """
    return any(unicodedata.category(char).startswith("L") for char in name)

`load_person_names()`

Load the person QID to name mappings from disk. This is a collection of aliases (in various alphabets) of person name parts mapped to a Wikidata QID representing that name part.

Returns:

Type	Description
`Generator[Tuple[str, List[str]], None, None]`	A generator yielding tuples of QID and list of names.

Source code in rigour/names/person.py

def load_person_names() -> Generator[Tuple[str, List[str]], None, None]:
    """Stream the person QID to name mappings from the data file at `NAMES_DATA_PATH`.

    The file holds aliases (in various alphabets) of person name parts, each group
    mapped to a Wikidata QID representing that name part. Every line is read as
    `alias, alias, ... => QID`; a line that does not split into exactly two pieces
    on " => " raises ValueError.

    Returns:
        Generator[Tuple[str, List[str]], None, None]: A generator yielding tuples of QID and list of names.
    """
    with open(NAMES_DATA_PATH, "r", encoding="utf-8") as fh:
        for entry in map(str.strip, fh):
            aliases, qid = entry.split(" => ")
            yield qid, aliases.split(", ")

`load_person_names_mapping(normalizer=noop_normalizer)`

Load the person QID to name mappings from disk. This is a collection of aliases (in various alphabets) of person name parts mapped to a Wikidata QID representing that name part.

Parameters:

Name	Type	Description	Default
`normalizer`	`Normalizer`	A function to normalize names. Defaults to noop_normalizer.	`noop_normalizer`

Returns:

Type	Description
`Dict[str, Set[str]]`	Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.

Source code in rigour/names/person.py

def load_person_names_mapping(
    normalizer: Normalizer = noop_normalizer,
) -> Dict[str, Set[str]]:
    """Build a lookup from normalized person name aliases to Wikidata QIDs.

    The aliases come from `load_person_names`: a collection of aliases (in various
    alphabets) of person name parts, each mapped to a QID representing that name part.
    Aliases for which the normalizer returns None are left out.

    Args:
        normalizer (Normalizer, optional): A function to normalize names. Defaults to noop_normalizer.

    Returns:
        Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.
    """
    mapping: Dict[str, Set[str]] = {}
    for qid, aliases in load_person_names():
        for alias in aliases:
            key = normalizer(alias)
            if key is not None:
                mapping.setdefault(key, set()).add(qid)
    return mapping

`pick_case(names)`

Pick the best mix of lower- and uppercase characters from a set of names that are identical except for case.

Parameters:

Name	Type	Description	Default
`names`	`List[str]`	A list of identical names in different cases.	required

Returns:

Type	Description
`Optional[str]`	Optional[str]: The best name for display.

Source code in rigour/names/pick.py

def pick_case(names: List[str]) -> Optional[str]:
    """Pick the best mix of lower- and uppercase characters from a set of names
    that are identical except for case.

    The title-cased form of the first name serves as the reference; the name that
    differs from it in the fewest character positions wins. On a tie, the name that
    appears first in the input is returned.

    Args:
        names (List[str]): A list of identical names in different cases.

    Returns:
        Optional[str]: The best name for display, or None if the list is empty.
    """
    if not names:
        return None
    if len(names) == 1:
        return names[0]
    reference = names[0].title()

    def _difference(name: str) -> int:
        # Case conversion can change the length of a string (e.g. "ß" vs "SS"), so a
        # name may be shorter than the reference. A missing position counts as a
        # difference instead of raising IndexError.
        return sum(
            1
            for i, char in enumerate(reference)
            if i >= len(name) or name[i] != char
        )

    # dict.fromkeys de-duplicates while keeping input order, so ties resolve to the
    # earliest name.
    return min(dict.fromkeys(names), key=_difference)

`pick_name(names)`

Pick the best name from a list of names. This is meant to pick a centroid name, with a bias towards names in a latin script.

Parameters:

Name	Type	Description	Default
`names`	`List[str]`	A list of names.	required

Returns:

Type	Description
`Optional[str]`	Optional[str]: The best name for display.

Source code in rigour/names/pick.py

def pick_name(names: List[str]) -> Optional[str]:
    """Choose the most representative ("centroid") name from a list of names, with a
    bias towards names in a latin script.

    Names that are empty after stripping whitespace are ignored. If exactly one name
    has a latin share above 0.9, it is returned directly. Otherwise candidates are
    ranked with `levenshtein_pick`, and the first surface form that is one of the
    given names is returned.

    Args:
        names (List[str]): A list of names.

    Returns:
        Optional[str]: The best name for display, or None if no name qualifies.
    """
    weights: Dict[str, float] = defaultdict(float)
    forms: Dict[str, List[str]] = defaultdict(list)
    latin_names: List[str] = []
    for candidate in sorted(names):
        lowered = candidate.strip().lower()
        if not lowered:
            continue
        share = latin_share(candidate)
        if share > 0.9:
            latin_names.append(candidate)
        # even totally non-Latin names have a base weight of 1:
        weight = 1 + share
        weights[lowered] += weight
        # The title-cased variant is offered as a surface form too; it can only be
        # returned if it is one of the given names (see the membership test below).
        forms[lowered].extend((candidate, candidate.title()))

        folded = ascii_text(lowered)
        if folded:
            weights[folded] += weight
            forms[folded].append(candidate)

    if len(latin_names) == 1:
        return latin_names[0]

    for form in levenshtein_pick(list(weights), weights):
        for surface in levenshtein_pick(forms.get(form, []), {}):
            if surface in names:
                return surface
    return None

`remove_org_types(name, replacement='', normalizer=_normalize_compare)`

Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and replace it with the given fixed string (empty by default, which signals removal).

Parameters:

Name	Type	Description	Default
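`name`	`str`	The text to be processed. It is assumed to be already normalized (see below).	required
`replacement`	`str`	The fixed string substituted for each matched organization type. Empty by default, which signals removal.	`''`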
`normalizer`	`Callable[[str \| None], str \| None]`	A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.	`_normalize_compare`

Returns:

Name	Type	Description
`str`	`str`	The text with organization types replaced/removed.

Source code in rigour/names/org_types.py

def remove_org_types(
    name: str, replacement: str = "", normalizer: Normalizer = _normalize_compare
) -> str:
    """Replace every organization type designation (e.g. LLC, Inc, GmbH) found in the
    given entity name with a fixed string.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        replacement (str): The string put in place of each matched organization type.
            Empty by default, which signals removal.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced/removed.
    """
    return _compare_replacer(normalizer=normalizer).remove(
        name, replacement=replacement
    )

`remove_person_prefixes(name)`

Remove prefixes like Mr., Mrs., etc.

Source code in rigour/names/person.py

def remove_person_prefixes(name: str) -> str:
    """Strip person prefixes such as Mr., Mrs., etc. from a name.

    Every match of the pattern returned by `re_person_prefixes()` is replaced with
    the empty string.
    """
    pattern = re_person_prefixes()
    return pattern.sub("", name)

`replace_org_types_compare(name, normalizer=_normalize_compare)`

Replace any organization type indicated in the given entity name (often as a prefix or suffix) with a heavily normalized form label. This will re-write country-specific entity types (eg. GmbH) into a globally normalized set of types (LLC). The resulting text is meant to be used in comparison processes, but no longer fit for presentation to a user.

Parameters:

Name	Type	Description	Default
`name`	`str`	The text to be processed. It is assumed to be already normalized (see below).	required
`normalizer`	`Callable[[str \| None], str \| None]`	A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.	`_normalize_compare`

Returns:

Type	Description
`str`	The text with organization types replaced.

Source code in rigour/names/org_types.py

def replace_org_types_compare(
    name: str, normalizer: Normalizer = _normalize_compare
) -> str:
    """Rewrite organization types in an entity name (often a prefix or suffix) into a
    heavily normalized form label. Country-specific entity types (eg. GmbH) become
    members of a globally normalized set of types (LLC). The output is meant for
    comparison processes and is no longer fit for presentation to a user.

    If the replacement produces no text (None or an empty string), the original name
    is returned unchanged.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced.
    """
    replaced = _compare_replacer(normalizer=normalizer)(name)
    if not replaced:
        return name
    return replaced

`replace_org_types_display(name, normalizer=normalize_display)`

Replace organization types in the text with their shortened form. This will perform a display-safe (light) form of normalization, useful for shortening spelt-out legal forms into common abbreviations (eg. Siemens Aktiengesellschaft -> Siemens AG).

If the result of the replacement yields an empty string, the original text is returned as-is.

Parameters:

Name	Type	Description	Default
`name`	`str`	The text to be processed. It is assumed to be already normalized (see below).	required
`normalizer`	`Callable[[str \| None], str \| None]`	A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.	`normalize_display`

Returns:

Type	Description
`str`	The text with organization types replaced.

Source code in rigour/names/org_types.py

def replace_org_types_display(
    name: str, normalizer: Normalizer = normalize_display
) -> str:
    """Replace organization types in the text with their shortened form. This will perform
    a display-safe (light) form of normalization, useful for shortening spelt-out legal forms
    into common abbreviations (eg. Siemens Aktiengesellschaft -> Siemens AG).

    If the replacement yields no text (None or an empty string), the original text is
    returned as-is. If the input is entirely uppercase, the result is uppercased as well.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced.
    """
    is_uppercase = name.isupper()
    replacer = _display_replacer(normalizer=normalizer)
    out_text = replacer(name)
    if not out_text:
        # Covers both None and "": the documented contract is to fall back to the
        # original text rather than hand back an empty name.
        return name
    if is_uppercase:
        # Keep all-caps input all-caps after the replacement.
        out_text = out_text.upper()
    return out_text

`tokenize_name(text, token_min_length=1)`

Split a person or entity's name into name parts.

Source code in rigour/names/tokenize.py

def tokenize_name(text: str, token_min_length: int = 1) -> List[str]:
    """Split the name of a person or entity into its name parts.

    The characters ``.``, ``'`` and ``’`` are dropped without ending the current token.
    Every other character is looked up by Unicode category in `TOKEN_SEP_CATEGORIES`:
    a None entry drops the character, a `WS` entry ends the current token, any other
    entry replaces the character, and characters in unlisted categories are kept as-is.

    Args:
        text (str): The name to split.
        token_min_length (int): Tokens shorter than this are discarded.

    Returns:
        List[str]: The tokens in order of appearance.
    """
    # FIXME: Do we want to support CJK scripts at some stage?
    tokens: List[str] = []
    pending: List[str] = []

    def _flush() -> None:
        # Emit the pending characters as a token if long enough, then start afresh.
        if len(pending) >= token_min_length:
            tokens.append("".join(pending))
        pending.clear()

    # TODO: Do we want to do some form of unicode normalization here?
    # text = unicodedata.normalize("NFC", text)
    for char in text:
        if char in ".'’":
            continue
        mapped = TOKEN_SEP_CATEGORIES.get(unicodedata.category(char), char)
        if mapped is None:
            continue
        if mapped == WS:
            _flush()
        else:
            pending.append(mapped)

    _flush()
    return tokens