Text

`rigour.text`

`dam_levenshtein(left, right, max_length=env.MAX_NAME_LENGTH, max_edits=None)` `cached`

Compute the Damerau-Levenshtein distance between two strings.

Parameters:

Name	Type	Description	Default
`left`	`str`	A string.	required
`right`	`str`	A string.	required

Returns:

Type	Description
`int`	An integer of changed characters.

Source code in rigour/text/distance.py

@lru_cache(maxsize=MEMO_SMALL)
def dam_levenshtein(
    left: str,
    right: str,
    max_length: int = env.MAX_NAME_LENGTH,
    max_edits: Optional[int] = None,
) -> int:
    """Compute the Damerau-Levenshtein distance between two strings.

    Identical inputs short-circuit to zero. Otherwise only the first
    `max_length` characters of each string take part in the comparison.

    Args:
        left: A string.
        right: A string.
        max_length: Number of leading characters of each string to compare.
        max_edits: Forwarded to `DamerauLevenshtein.distance` as its
            `score_cutoff` argument.

    Returns:
        The distance reported by `DamerauLevenshtein.distance`, or 0 when
        both inputs are equal.
    """
    if left != right:
        left_cut = left[:max_length]
        right_cut = right[:max_length]
        return DamerauLevenshtein.distance(
            left_cut,
            right_cut,
            score_cutoff=max_edits,
        )
    return 0

`is_levenshtein_plausible(left, right, max_edits=env.LEVENSHTEIN_MAX_EDITS, max_percent=env.LEVENSHTEIN_MAX_PERCENT, max_length=env.MAX_NAME_LENGTH)`

A sanity check to post-filter name matching results based on a budget of allowed Levenshtein distance. This basically cuts off results where the Jaro-Winkler or Metaphone comparison was too lenient.

Parameters:

Name	Type	Description	Default
`left`	`str`	A string.	required
`right`	`str`	A string.	required
`max_edits`	`Optional[int]`	The maximum number of edits allowed.	`LEVENSHTEIN_MAX_EDITS`
`max_percent`	`float`	The maximum fraction of the shortest string that is allowed to be edited.	`LEVENSHTEIN_MAX_PERCENT`

Returns:

Type	Description
`bool`	A boolean.

Source code in rigour/text/distance.py

def is_levenshtein_plausible(
    left: str,
    right: str,
    max_edits: Optional[int] = env.LEVENSHTEIN_MAX_EDITS,
    max_percent: float = env.LEVENSHTEIN_MAX_PERCENT,
    max_length: int = env.MAX_NAME_LENGTH,
) -> bool:
    """Check whether two names are within a Levenshtein edit budget.

    Used as a sanity check to post-filter name matching results, cutting off
    pairs where the Jaro-Winkler or Metaphone comparison was too lenient.

    Args:
        left: A string.
        right: A string.
        max_edits: The maximum number of edits allowed; `None` leaves only
            the percentage-based limit in place.
        max_percent: The maximum fraction of the shorter string that is
            allowed to be edited.
        max_length: Number of leading characters of each string to compare.

    Returns:
        True if the Levenshtein distance does not exceed the edit budget.
    """
    left_cut = left[:max_length]
    right_cut = right[:max_length]

    # The budget scales with the shorter of the two (truncated) strings and
    # is additionally capped by the absolute limit, when one is given.
    shorter = min(len(left_cut), len(right_cut))
    budget = math.ceil(shorter * max_percent)
    if max_edits is not None:
        budget = min(max_edits, budget)

    return levenshtein(left_cut, right_cut, max_length, max_edits=budget) <= budget

`jaro_winkler(left, right, max_length=env.MAX_NAME_LENGTH)` `cached`

Compute the Jaro-Winkler similarity of two strings.

Parameters:

Name	Type	Description	Default
`left`	`str`	A string.	required
`right`	`str`	A string.	required

Returns:

Type	Description
`float`	A float between 0.0 and 1.0; scores of 0.6 or lower are returned as 0.0.

Source code in rigour/text/distance.py

@lru_cache(maxsize=MEMO_SMALL)
def jaro_winkler(left: str, right: str, max_length: int = env.MAX_NAME_LENGTH) -> float:
    """Compute the Jaro-Winkler similarity of two strings.

    Only the first `max_length` characters of each string are compared.
    Scores that do not exceed 0.6 are flattened to 0.0.

    Args:
        left: A string.
        right: A string.
        max_length: Number of leading characters of each string to compare.

    Returns:
        The normalized similarity if it is above 0.6, otherwise 0.0.
    """
    left_cut = left[:max_length]
    right_cut = right[:max_length]
    similarity = JaroWinkler.normalized_similarity(left_cut, right_cut)
    if similarity > 0.6:
        return similarity
    return 0.0

`levenshtein(left, right, max_length=env.MAX_NAME_LENGTH, max_edits=None)` `cached`

Compute the Levenshtein distance between two strings.

Parameters:

Name	Type	Description	Default
`left`	`str`	A string.	required
`right`	`str`	A string.	required

Returns:

Type	Description
`int`	An integer of changed characters.

Source code in rigour/text/distance.py

@lru_cache(maxsize=MEMO_SMALL)
def levenshtein(
    left: str,
    right: str,
    max_length: int = env.MAX_NAME_LENGTH,
    max_edits: Optional[int] = None,
) -> int:
    """Compute the Levenshtein distance between two strings.

    Identical inputs short-circuit to zero. Otherwise only the first
    `max_length` characters of each string take part in the comparison.

    Args:
        left: A string.
        right: A string.
        max_length: Number of leading characters of each string to compare.
        max_edits: Forwarded to `Levenshtein.distance` as its `score_cutoff`
            argument.

    Returns:
        The distance reported by `Levenshtein.distance`, or 0 when both
        inputs are equal.
    """
    if left != right:
        left_cut = left[:max_length]
        right_cut = right[:max_length]
        return Levenshtein.distance(
            left_cut,
            right_cut,
            score_cutoff=max_edits,
        )
    return 0

`levenshtein_similarity(left, right, max_edits=env.LEVENSHTEIN_MAX_EDITS, max_percent=env.LEVENSHTEIN_MAX_PERCENT, max_length=env.MAX_NAME_LENGTH)`

Compute the Levenshtein similarity of two strings. The similarity is one minus the edit distance divided by the length of the longest string; 0.0 is returned when the distance exceeds the allowed edit budget.

Parameters:

Name	Type	Description	Default
`left`	`str`	A string.	required
`right`	`str`	A string.	required
`max_edits`	`Optional[int]`	The maximum number of edits allowed.	`LEVENSHTEIN_MAX_EDITS`
`max_percent`	`float`	The maximum fraction of the shortest string that is allowed to be edited.	`LEVENSHTEIN_MAX_PERCENT`

Returns:

Type	Description
`float`	A float between 0.0 and 1.0.

Source code in rigour/text/distance.py

def levenshtein_similarity(
    left: str,
    right: str,
    max_edits: Optional[int] = env.LEVENSHTEIN_MAX_EDITS,
    max_percent: float = env.LEVENSHTEIN_MAX_PERCENT,
    max_length: int = env.MAX_NAME_LENGTH,
) -> float:
    """Compute the Levenshtein similarity of two strings.

    The similarity is one minus the edit distance divided by the length of
    the longest string. Both strings are truncated to `max_length` characters
    before anything else is computed, so the edit budget, the length guard
    and the denominator all describe the same text the distance is measured
    on (matching `is_levenshtein_plausible`).

    Args:
        left: A string.
        right: A string.
        max_edits: The maximum number of edits allowed; `None` leaves only
            the percentage-based limit in place.
        max_percent: The maximum fraction of the shortest string that is
            allowed to be edited.
        max_length: Number of leading characters of each string to compare.

    Returns:
        A float between 0.0 and 1.0. 0.0 is returned when either string is
        empty or when the distance exceeds the edit budget.
    """
    # Truncate up front: the distance below is computed on the truncated
    # strings, so lengths taken from the untruncated inputs would skew both
    # the budget and the final ratio for over-long values.
    left = left[:max_length]
    right = right[:max_length]
    left_len = len(left)
    right_len = len(right)
    if left_len == 0 or right_len == 0:
        return 0.0

    # Skip results with an overall distance of more than N characters:
    pct_edits = math.ceil(min(left_len, right_len) * max_percent)
    max_edits_ = min(max_edits, pct_edits) if max_edits is not None else pct_edits
    # The length difference is a lower bound on the edit distance, so this
    # rejects hopeless pairs without computing the distance at all.
    if abs(left_len - right_len) > max_edits_:
        return 0.0

    distance = levenshtein(left, right, max_length=max_length, max_edits=max_edits_)
    if distance > max_edits_:
        return 0.0
    return 1.0 - (float(distance) / max(left_len, right_len))

`metaphone(token)` `cached`

Get the metaphone phonetic representation of a token.

Source code in rigour/text/phonetics.py

@lru_cache(maxsize=MEMO_LARGE)
def metaphone(token: str) -> str:
    """Return the metaphone phonetic representation of a token.

    Results are memoised with `lru_cache` (sized by `MEMO_LARGE`).

    Args:
        token: The token to encode.

    Returns:
        The value produced by `metaphone_` for the token.
    """
    phonetic_key = metaphone_(token)
    return phonetic_key

`remove_bracketed_text(text)`

Remove any text in brackets. This is meant to handle names of companies which include the jurisdiction, like: Turtle Management (Seychelles) Ltd.

Parameters:

Name	Type	Description	Default
`text`	`str`	A text including text in brackets.	required

Returns:

Type	Description
`str`	Text where this has been substituted for whitespace.

Source code in rigour/text/cleaning.py

def remove_bracketed_text(text: str) -> str:
    """Strip bracketed passages out of a text.

    Intended for company names that carry their jurisdiction in brackets,
    such as: Turtle Management (Seychelles) Ltd.

    Args:
        text: A text including text in brackets.

    Returns:
        The text with every match of `BRACKETED` replaced by `WS`.
    """
    without_brackets = BRACKETED.sub(WS, text)
    return without_brackets

`remove_emoji(string)`

Remove unicode ranges used by emoticons, symbols, flags and other visual codepoints from a piece of text. Primary use case is to remove emojis from the names of political office holders coming from Wikidata.

Parameters:

Name	Type	Description	Default
`string`	`str`	Text that may include emoji and pictographs.	required

Returns:

Type	Description
`str`	Text that doesn't include those.

Source code in rigour/text/cleaning.py

def remove_emoji(string: str) -> str:
    """Strip emoji and other pictographic codepoints from a piece of text.

    Covers the unicode ranges used by emoticons, symbols, flags and similar
    visual codepoints. Primary use case is cleaning up the names of political
    office holders coming from Wikidata.

    Args:
        string: Text that may include emoji and pictographs.

    Returns:
        The text with every match of `EMOJI_PATTERN` removed.
    """
    cleaned = EMOJI_PATTERN.sub(r"", string)
    return cleaned

`soundex(token)` `cached`

Get the soundex phonetic representation of a token.

Source code in rigour/text/phonetics.py

@lru_cache(maxsize=MEMO_LARGE)
def soundex(token: str) -> str:
    """Return the soundex phonetic representation of a token.

    Results are memoised with `lru_cache` (sized by `MEMO_LARGE`).

    Args:
        token: The token to encode.

    Returns:
        The value produced by `soundex_` for the token.
    """
    phonetic_key = soundex_(token)
    return phonetic_key

Text

rigour.text

dam_levenshtein(left, right, max_length=env.MAX_NAME_LENGTH, max_edits=None) cached

is_levenshtein_plausible(left, right, max_edits=env.LEVENSHTEIN_MAX_EDITS, max_percent=env.LEVENSHTEIN_MAX_PERCENT, max_length=env.MAX_NAME_LENGTH)

jaro_winkler(left, right, max_length=env.MAX_NAME_LENGTH) cached

levenshtein(left, right, max_length=env.MAX_NAME_LENGTH, max_edits=None) cached

levenshtein_similarity(left, right, max_edits=env.LEVENSHTEIN_MAX_EDITS, max_percent=env.LEVENSHTEIN_MAX_PERCENT, max_length=env.MAX_NAME_LENGTH)

metaphone(token) cached

remove_bracketed_text(text)

remove_emoji(string)

soundex(token) cached

`rigour.text`

`dam_levenshtein(left, right, max_length=env.MAX_NAME_LENGTH, max_edits=None)` `cached`

`is_levenshtein_plausible(left, right, max_edits=env.LEVENSHTEIN_MAX_EDITS, max_percent=env.LEVENSHTEIN_MAX_PERCENT, max_length=env.MAX_NAME_LENGTH)`

`jaro_winkler(left, right, max_length=env.MAX_NAME_LENGTH)` `cached`

`levenshtein(left, right, max_length=env.MAX_NAME_LENGTH, max_edits=None)` `cached`

`levenshtein_similarity(left, right, max_edits=env.LEVENSHTEIN_MAX_EDITS, max_percent=env.LEVENSHTEIN_MAX_PERCENT, max_length=env.MAX_NAME_LENGTH)`

`metaphone(token)` `cached`

`remove_bracketed_text(text)`

`remove_emoji(string)`

`soundex(token)` `cached`