Skip to content

Names

rigour.names

Name handling utilities for person and organisation names.

name_parts(name, person=True, organisation=True)

Split a name into parts, and return a list of NamePart objects.

Source code in rigour/names/part.py
def name_parts(
    name: str, person: bool = True, organisation: bool = True
) -> List[NamePart]:
    """Tokenize a name and wrap every non-empty token in a NamePart.

    The name is first passed through ``prepare_tokenize_name``; when
    ``person`` is true it is additionally passed through
    ``remove_person_prefixes``. The result is split on the ``WS`` separator.

    Args:
        name (str): The name to split.
        person (bool): Whether to apply ``remove_person_prefixes`` before
            splitting.
        organisation (bool): Accepted for API compatibility; the current
            implementation does not read it.

    Returns:
        List[NamePart]: One NamePart per non-empty token. Each part's
        ``index`` is the token's position in the raw split, so empty tokens
        that were skipped still consume an index.
    """
    cleaned = prepare_tokenize_name(name)
    if person:
        cleaned = remove_person_prefixes(cleaned)
    # TODO: chunk down organisation legal forms
    result: List[NamePart] = []
    for position, token in enumerate(cleaned.split(WS)):
        if not token:
            # Empty tokens are dropped, but keep their slot in the numbering.
            continue
        result.append(NamePart(token, index=position))
    return result

pick_name(names)

Pick the best name from a list of names. This is meant to pick a centroid name, with a bias towards names in a Latin script.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| names | List[str] | A list of names. | required |

Returns:

| Type | Description |
| --- | --- |
| Optional[str] | The best name for display. |

Source code in rigour/names/pick.py
def pick_name(names: List[str]) -> Optional[str]:
    """Choose a single display name out of several variants.

    The selection aims for a centroid name and favours names written in a
    Latin script: every name is weighted by ``1 + latin_share(name)``.

    Args:
        names (List[str]): A list of names.

    Returns:
        Optional[str]: The best name for display, or None if no candidate
        from ``names`` could be selected.
    """
    scores: Dict[str, float] = defaultdict(float)
    surfaces: Dict[str, List[str]] = defaultdict(list)
    mostly_latin: List[str] = []

    # Iterate in sorted order so the result does not depend on input order.
    for candidate in sorted(names):
        key = candidate.strip().lower()
        if not key:
            continue

        # A name without any Latin characters still gets a base score of 1.
        share = latin_share(candidate)
        if share > 0.9:
            mostly_latin.append(candidate)
        score = 1 + share

        scores[key] += score
        # The title-cased variant is only a helper candidate; it can be
        # returned solely if it is also one of the input names (see below).
        surfaces[key].extend((candidate, candidate.title()))

        # Also credit the ASCII rendering of the form, so variants that
        # share an ASCII form pool their weight.
        folded = ascii_text(key)
        if folded is None or len(folded) == 0:
            continue
        scores[folded] += score
        surfaces[folded].append(candidate)

    # Exactly one predominantly-Latin name: take it without further ranking.
    if len(mostly_latin) == 1:
        return mostly_latin[0]

    # NOTE(review): levenshtein_pick presumably yields candidates in order of
    # preference; confirm against its definition.
    ranked_forms = levenshtein_pick(list(scores), scores)
    for key in ranked_forms:
        for surface in levenshtein_pick(surfaces.get(key, []), {}):
            # Only ever return a string the caller actually supplied.
            if surface in names:
                return surface
    return None