Skip to content

Text

rigour.text

dam_levenshtein(left, right, max_length=env.MAX_NAME_LENGTH, max_edits=None) cached

Compute the Damerau-Levenshtein distance between two strings.

Parameters:

Name Type Description Default
left str

A string.

required
right str

A string.

required

Returns:

Type Description
int

The edit distance between the two strings, as an integer.

Source code in rigour/text/distance.py
@lru_cache(maxsize=MEMO_SMALL)
def dam_levenshtein(
    left: str,
    right: str,
    max_length: int = env.MAX_NAME_LENGTH,
    max_edits: Optional[int] = None,
) -> int:
    """Measure how far apart two strings are using Damerau-Levenshtein.

    Calls are memoised through ``lru_cache``.

    Args:
        left: The first string to compare.
        right: The second string to compare.
        max_length: Only the first ``max_length`` characters of each string
            are handed to the distance backend.
        max_edits: Forwarded unchanged to the backend as ``score_cutoff``.

    Returns:
        ``0`` when both inputs are equal, otherwise the value reported by
        ``DamerauLevenshtein.distance`` for the truncated inputs.
    """
    if left != right:
        # Cap both inputs before they reach the backend.
        head_left = left[:max_length]
        head_right = right[:max_length]
        # NOTE(review): if the backend is rapidfuzz, a distance above the
        # cutoff is reported as ``max_edits + 1`` - confirm against its docs.
        return DamerauLevenshtein.distance(
            head_left,
            head_right,
            score_cutoff=max_edits,
        )
    # Identical inputs need no backend call at all.
    return 0

is_levenshtein_plausible(left, right, max_edits=env.LEVENSHTEIN_MAX_EDITS, max_percent=env.LEVENSHTEIN_MAX_PERCENT, max_length=env.MAX_NAME_LENGTH)

A sanity check to post-filter name matching results based on a budget of allowed Levenshtein distance. This basically cuts off results where the Jaro-Winkler or Metaphone comparison was too lenient.

Parameters:

Name Type Description Default
left str

A string.

required
right str

A string.

required
max_edits Optional[int]

The maximum number of edits allowed.

LEVENSHTEIN_MAX_EDITS
max_percent float

The maximum percentage of edits allowed.

LEVENSHTEIN_MAX_PERCENT

Returns:

Type Description
bool

A boolean.

Source code in rigour/text/distance.py
def is_levenshtein_plausible(
    left: str,
    right: str,
    max_edits: Optional[int] = env.LEVENSHTEIN_MAX_EDITS,
    max_percent: float = env.LEVENSHTEIN_MAX_PERCENT,
    max_length: int = env.MAX_NAME_LENGTH,
) -> bool:
    """Decide whether two strings are within an allowed Levenshtein budget.

    Intended as a post-filter for name matching: it rejects pairs that a
    Jaro-Winkler or Metaphone comparison accepted too generously.

    Args:
        left: The first string to compare.
        right: The second string to compare.
        max_edits: Absolute cap on the number of edits; ``None`` means only
            the percentage-based cap applies.
        max_percent: Fraction of the shorter (truncated) string's length that
            may be edited; the resulting count is rounded up.
        max_length: Both strings are cut to this many characters before any
            length or distance is computed.

    Returns:
        ``True`` if the Levenshtein distance does not exceed the budget.
    """
    lhs = left[:max_length]
    rhs = right[:max_length]

    # The budget is the stricter of the relative and the absolute limit.
    shorter = min(len(lhs), len(rhs))
    budget = math.ceil(shorter * max_percent)
    if max_edits is not None:
        budget = min(max_edits, budget)

    # ``max_length`` stays positional so the cached ``levenshtein`` call uses
    # the same cache key shape as before.
    edits = levenshtein(lhs, rhs, max_length, max_edits=budget)
    return edits <= budget

jaro_winkler(left, right, max_length=env.MAX_NAME_LENGTH) cached

Compute the Jaro-Winkler similarity of two strings.

Parameters:

Name Type Description Default
left str

A string.

required
right str

A string.

required

Returns:

Type Description
float

A float between 0.0 and 1.0. Scores of 0.6 or lower are reported as 0.0.

Source code in rigour/text/distance.py
@lru_cache(maxsize=MEMO_SMALL)
def jaro_winkler(left: str, right: str, max_length: int = env.MAX_NAME_LENGTH) -> float:
    """Score the Jaro-Winkler similarity of two strings.

    Calls are memoised through ``lru_cache``.

    Args:
        left: The first string to compare.
        right: The second string to compare.
        max_length: Only the first ``max_length`` characters of each string
            are compared.

    Returns:
        The normalized similarity when it is strictly greater than 0.6,
        otherwise 0.0.
    """
    head_left = left[:max_length]
    head_right = right[:max_length]
    similarity = JaroWinkler.normalized_similarity(head_left, head_right)
    # Anything at or below the 0.6 floor is reported as "no similarity".
    if similarity > 0.6:
        return similarity
    return 0.0

levenshtein(left, right, max_length=env.MAX_NAME_LENGTH, max_edits=None) cached

Compute the Levenshtein distance between two strings.

Parameters:

Name Type Description Default
left str

A string.

required
right str

A string.

required

Returns:

Type Description
int

The edit distance between the two strings, as an integer.

Source code in rigour/text/distance.py
@lru_cache(maxsize=MEMO_SMALL)
def levenshtein(
    left: str,
    right: str,
    max_length: int = env.MAX_NAME_LENGTH,
    max_edits: Optional[int] = None,
) -> int:
    """Measure how far apart two strings are using Levenshtein distance.

    Calls are memoised through ``lru_cache``.

    Args:
        left: The first string to compare.
        right: The second string to compare.
        max_length: Only the first ``max_length`` characters of each string
            are handed to the distance backend.
        max_edits: Forwarded unchanged to the backend as ``score_cutoff``.

    Returns:
        ``0`` when both inputs are equal, otherwise the value reported by
        ``Levenshtein.distance`` for the truncated inputs.
    """
    if left != right:
        # Cap both inputs before they reach the backend.
        head_left = left[:max_length]
        head_right = right[:max_length]
        # NOTE(review): if the backend is rapidfuzz, a distance above the
        # cutoff is reported as ``max_edits + 1`` - confirm against its docs.
        return Levenshtein.distance(
            head_left,
            head_right,
            score_cutoff=max_edits,
        )
    # Identical inputs need no backend call at all.
    return 0

levenshtein_similarity(left, right, max_edits=env.LEVENSHTEIN_MAX_EDITS, max_percent=env.LEVENSHTEIN_MAX_PERCENT, max_length=env.MAX_NAME_LENGTH)

Compute the Levenshtein similarity of two strings. The similarity is one minus the edit distance divided by the length of the longest string; pairs that exceed the allowed edit budget score 0.0.

Parameters:

Name Type Description Default
left str

A string.

required
right str

A string.

required
max_edits Optional[int]

The maximum number of edits allowed.

LEVENSHTEIN_MAX_EDITS
max_percent float

The maximum fraction of the shortest string that is allowed to be edited.

LEVENSHTEIN_MAX_PERCENT

Returns:

Type Description
float

A float between 0.0 and 1.0.

Source code in rigour/text/distance.py
def levenshtein_similarity(
    left: str,
    right: str,
    max_edits: Optional[int] = env.LEVENSHTEIN_MAX_EDITS,
    max_percent: float = env.LEVENSHTEIN_MAX_PERCENT,
    max_length: int = env.MAX_NAME_LENGTH,
) -> float:
    """Compute the Levenshtein similarity of two strings. The similarity is
    one minus the edit distance divided by the length of the longest string.

    Both strings are truncated to ``max_length`` first, and every length used
    below (edit budget, length-difference shortcut, denominator) is taken
    from the truncated strings. This keeps the score consistent with the
    distance, which is computed on the truncated strings, and with
    ``is_levenshtein_plausible``, which truncates the same way.

    Args:
        left: A string.
        right: A string.
        max_edits: The maximum number of edits allowed; ``None`` means only
            the percentage-based limit applies.
        max_percent: The maximum fraction of the shortest string that is
            allowed to be edited (rounded up to a whole number of edits).
        max_length: Only the first ``max_length`` characters of each string
            are considered.

    Returns:
        A float between 0.0 and 1.0. 0.0 is returned when either string is
        empty or when the pair exceeds the edit budget.
    """
    # Truncate up front so lengths and distance describe the same text.
    left = left[:max_length]
    right = right[:max_length]
    left_len = len(left)
    right_len = len(right)
    if left_len == 0 or right_len == 0:
        return 0.0

    # Skip results with an overall distance of more than N characters:
    pct_edits = math.ceil(min(left_len, right_len) * max_percent)
    max_edits_ = min(max_edits, pct_edits) if max_edits is not None else pct_edits
    # The length difference is a lower bound on the edit distance, so this
    # avoids calling the distance function for hopeless pairs.
    if abs(left_len - right_len) > max_edits_:
        return 0.0

    distance = levenshtein(left, right, max_length=max_length, max_edits=max_edits_)
    if distance > max_edits_:
        return 0.0
    return 1.0 - (float(distance) / max(left_len, right_len))

metaphone(token) cached

Get the metaphone phonetic representation of a token.

Source code in rigour/text/phonetics.py
@lru_cache(maxsize=MEMO_LARGE)
def metaphone(token: str) -> str:
    """Return the metaphone phonetic key for a token.

    Calls are memoised through ``lru_cache``.

    Args:
        token: The token to encode.

    Returns:
        The value produced by ``metaphone_`` for ``token``.
    """
    phonetic_key = metaphone_(token)
    return phonetic_key

remove_bracketed_text(text)

Remove any text in brackets. This is meant to handle names of companies which include the jurisdiction, like: Turtle Management (Seychelles) Ltd.

Parameters:

Name Type Description Default
text str

A text including text in brackets.

required

Returns:

Type Description
str

Text where this has been substituted for whitespace.

Source code in rigour/text/cleaning.py
def remove_bracketed_text(text: str) -> str:
    """Strip bracketed segments out of a piece of text.

    Useful for company names that embed a jurisdiction, for example
    ``Turtle Management (Seychelles) Ltd.``

    Args:
        text: Input text that may contain bracketed segments.

    Returns:
        The text with every ``BRACKETED`` match replaced by ``WS``.
    """
    without_brackets = BRACKETED.sub(WS, text)
    return without_brackets

remove_emoji(string)

Remove Unicode ranges used by emoticons, symbols, flags and other visual codepoints from a piece of text. The primary use case is to remove unwanted emojis from the names of political office holders coming from Wikidata.

Parameters:

Name Type Description Default
string str

Text that may include emoji and pictographs.

required

Returns:

Type Description
str

Text that doesn't include those.

Source code in rigour/text/cleaning.py
def remove_emoji(string: str) -> str:
    """Drop emoji and similar pictographic codepoints from a piece of text.

    The main use case is cleaning unwanted emojis out of the names of
    political office holders coming from Wikidata.

    Args:
        string: Text that may include emoji and pictographs.

    Returns:
        The text with every ``EMOJI_PATTERN`` match deleted.
    """
    # Matches are replaced with the empty string, i.e. removed outright.
    without_emoji = EMOJI_PATTERN.sub(r"", string)
    return without_emoji

soundex(token) cached

Get the soundex phonetic representation of a token.

Source code in rigour/text/phonetics.py
@lru_cache(maxsize=MEMO_LARGE)
def soundex(token: str) -> str:
    """Return the soundex phonetic key for a token.

    Calls are memoised through ``lru_cache``.

    Args:
        token: The token to encode.

    Returns:
        The value produced by ``soundex_`` for ``token``.
    """
    phonetic_key = soundex_(token)
    return phonetic_key