Skip to content

Names

rigour.names

Name handling utilities for person and organisation names.

name_parts(name, person=True, organisation=True)

Split a name into parts, and return a list of NamePart objects.

Source code in rigour/names/part.py
def name_parts(
    name: str, person: bool = True, organisation: bool = True
) -> List[NamePart]:
    """Tokenise a name and wrap every non-empty token in a ``NamePart``.

    The name is first run through ``prepare_tokenize_name``. When ``person``
    is true the result is additionally run through ``remove_person_prefixes``.
    The prepared string is then split on ``WS``.

    Args:
        name: The name to split.
        person: Whether to apply ``remove_person_prefixes`` before splitting.
        organisation: Accepted for API compatibility; not used by this
            function at present.

    Returns:
        One ``NamePart`` per non-empty token. The ``index`` given to each part
        is the token's position in the raw split, so positions occupied by
        empty tokens are skipped rather than renumbered.
    """
    prepared = prepare_tokenize_name(name)
    if person:
        prepared = remove_person_prefixes(prepared)
    # TODO: chunk down organisation legal forms
    result: List[NamePart] = []
    for position, token in enumerate(prepared.split(WS)):
        if not token:
            # Empty tokens are dropped but still consume a position.
            continue
        result.append(NamePart(token, index=position))
    return result

pick_name(names)

Pick the best name from a list of names. This is meant to pick a centroid name, with a bias towards names in a Latin script.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| names | List[str] | A list of names. | required |

Returns:

Type Description
Optional[str]

Optional[str]: The best name for display.

Source code in rigour/names/pick.py
def pick_name(names: List[str]) -> Optional[str]:
    """Choose a single display name out of several variants.

    Each non-blank name is grouped under its stripped, lower-cased form and,
    when ``ascii_text`` yields a non-empty result for that form, under the
    ASCII form as well. Every group accumulates a weight of
    ``1 + latin_share(name)``, so names with a larger Latin share count for
    more. If exactly one name has a Latin share above 0.9 it is returned
    directly; otherwise the groups and their surface forms are ranked with
    ``levenshtein_pick`` and the first surface form that literally occurs in
    ``names`` wins.

    Args:
        names (List[str]): A list of names.

    Returns:
        Optional[str]: The best name for display, or ``None`` if no candidate
        from ``names`` could be selected.
    """
    scores: Dict[str, float] = defaultdict(float)
    surfaces: Dict[str, List[str]] = defaultdict(list)
    latin_only: List[str] = []

    for candidate in sorted(names):
        key = candidate.strip().lower()
        if not key:
            # Blank or whitespace-only entries take no part in the vote.
            continue

        share = latin_share(candidate)
        if share > 0.9:
            latin_only.append(candidate)

        # The constant 1 gives even fully non-Latin names a base weight.
        score = 1 + share
        scores[key] += score
        # The title-cased variant is offered as a candidate too; it can only
        # be returned if it also appears verbatim in `names` (checked below).
        surfaces[key].extend((candidate, candidate.title()))

        ascii_key = ascii_text(key)
        if ascii_key:
            scores[ascii_key] += score
            surfaces[ascii_key].append(candidate)

    # A lone (almost) fully Latin name short-circuits the ranking.
    if len(latin_only) == 1:
        return latin_only[0]

    for key in levenshtein_pick(list(scores), scores):
        candidates = surfaces.get(key, [])
        for surface in levenshtein_pick(candidates, {}):
            if surface in names:
                return surface
    return None

tokenize_name(text, min_length=1)

Split a person or entity's name into name parts.

Source code in rigour/names/tokenize.py
def tokenize_name(text: str, min_length: int = 1) -> List[str]:
    """Break a person's or entity's name into a list of name parts.

    The text is scanned character by character. Full stops and apostrophes
    (``.``, ``'``, ``’``) are dropped outright. Every other character is
    looked up in ``TOKEN_SEP_CATEGORIES`` by its Unicode general category:
    a ``None`` mapping drops the character, a ``WS`` mapping ends the current
    token, and any other mapping (or the character itself when its category
    is not listed) is added to the current token.

    Args:
        text: The name to tokenize.
        min_length: Minimum number of collected characters a token needs in
            order to be kept.

    Returns:
        The tokens in order of appearance.
    """
    # FIXME: Do we want to support CJK scripts at some stage?
    parts: List[str] = []
    buffer: List[str] = []

    for char in text:
        if char in ".'’":
            continue
        # TODO: do we want to special case Arabic name parts like
        # al-, el-, il- ?
        category = unicodedata.category(char)
        mapped = TOKEN_SEP_CATEGORIES.get(category, char)
        if mapped is None:
            continue
        if mapped != WS:
            buffer.append(mapped)
            continue
        # Separator reached: emit the pending token if it is long enough.
        # TODO: do we want to support throwing away name prefixes
        # (like Mr., Sir, etc.) here?
        if len(buffer) >= min_length:
            parts.append("".join(buffer))
        buffer.clear()

    # Emit whatever is left after the final character.
    if len(buffer) >= min_length:
        parts.append("".join(buffer))
    return parts