Identifiers

`rigour.ids`

Handling of person, organisation and object identifiers. This module contains a collection of validation and formatting tools for identifiers. The IdentifierFormat class is the base class for all identifier formats, and it provides a common interface for validation and formatting.

Currently, identifiers can be accessed using short aliases, such as "imo" or "isin". In the future, we will need to introduce a proper, structured identification scheme for identifiers, with qualifiers for country (e.g. ru:nalog:inn, us:sam:uei).

`BIC`

Bases: IdentifierFormat

BIC (ISO 9362 Business identifier codes).

Source code in rigour/ids/stdnum_.py

class BIC(IdentifierFormat):
    """BIC (ISO 9362 Business identifier codes)."""

    TITLE = "BIC"
    STRONG: bool = True

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Report whether `value` passes the `bic` module's validity check."""
        return bic.is_valid(value)

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Reduce `value` to an upper-cased code of at most eight characters.

        Returns None when validation raises ValidationError, or when the
        shortened code no longer passes `is_valid`.
        """
        try:
            compacted = bic.compact(bic.validate(value))
            # Only the first eight characters of the compact form are kept.
            shortened = compacted[:8].upper()
            return shortened if cls.is_valid(shortened) else None
        except ValidationError:
            return None

    @classmethod
    def format(cls, value: str) -> str:
        """Return `value` rendered by the `bic` module's formatter."""
        return bic.format(value)

`CPF`

Bases: IdentifierFormat

Cadastro de Pessoas Físicas, Brazilian national identifier

Source code in rigour/ids/stdnum_.py

class CPF(IdentifierFormat):
    """Cadastro de Pessoas Físicas, Brazilian national identifier"""

    TITLE = "CPF"

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Report whether `value` passes the `cpf` module's validity check."""
        return cpf.is_valid(value)

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return the validated, compact form of `value`.

        Returns None when a ValidationError is raised along the way.
        """
        try:
            checked = cpf.validate(value)
            return cpf.compact(checked)
        except ValidationError:
            return None

    @classmethod
    def format(cls, value: str) -> str:
        """Return `value` rendered by the `cpf` module's formatter."""
        return cpf.format(value)

`FIGI`

Bases: IdentifierFormat

A FIGI number for a security, as managed by OpenFIGI.

Source code in rigour/ids/stdnum_.py

class FIGI(IdentifierFormat):
    """A FIGI number for a security, as managed by OpenFIGI."""

    TITLE = "FIGI"
    STRONG: bool = True

    # Handle on the underlying `figi` module; not read by the methods below.
    impl = figi

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Report whether `value` passes the `figi` module's validity check."""
        return figi.is_valid(value)

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return the validated, compact form of `value`.

        Returns None when a ValidationError is raised along the way.
        """
        try:
            checked = figi.validate(value)
            return figi.compact(checked)
        except ValidationError:
            return None

`FormatSpec`

Bases: TypedDict

An identifier format specification.

Source code in rigour/ids/__init__.py

class FormatSpec(TypedDict):
    """An identifier format specification."""

    # Human-readable title of the format.
    title: str
    # Every alias under which the format can be looked up.
    names: List[str]
    # Free-text description of the format.
    description: str

`IBAN`

Bases: IdentifierFormat

An IBAN number for a bank account.

Source code in rigour/ids/stdnum_.py

class IBAN(IdentifierFormat):
    """An IBAN number for a bank account."""

    TITLE = "IBAN"
    STRONG: bool = True

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Report whether `value` passes the `iban` module's validity check."""
        return iban.is_valid(value)

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return the validated, compact form of `value`.

        Returns None when a ValidationError is raised along the way.
        """
        try:
            checked = iban.validate(value)
            return iban.compact(checked)
        except ValidationError:
            return None

    @classmethod
    def format(cls, value: str) -> str:
        """Return `value` rendered by the `iban` module's formatter."""
        return iban.format(value)

`IMO`

Bases: IdentifierFormat

An IMO number for a ship or shipping company

Source code in rigour/ids/imo.py

class IMO(IdentifierFormat):
    """An IMO number for a ship or shipping company"""

    TITLE = "IMO"
    STRONG: bool = True

    @classmethod
    def is_valid(cls, text: str) -> bool:
        """Determine if the given string is a valid IMO number.

        The digits captured by ``IMO_RE`` are accepted when the final digit
        matches either the vessel check digit or the company check digit.
        """
        found = IMO_RE.search(text)
        if found is None:
            return False
        digits = [int(char) for char in found.group(2)]
        check_digit = digits[-1]

        # Vessel scheme: weights count down from 7 across the leading digits.
        vessel_total = 0
        for position, digit in enumerate(digits[:-1]):
            vessel_total += digit * (7 - position)
        if vessel_total % 10 == check_digit:
            return True

        # Company scheme: fixed weights on the first six digits, modulo 11.
        company_weights = (8, 6, 4, 2, 9, 7)
        company_total = sum(
            weight * digits[index] for index, weight in enumerate(company_weights)
        )
        return (11 - (company_total % 11)) % 10 == check_digit

    @classmethod
    def normalize(cls, text: str) -> Optional[str]:
        """Normalize the given string to a valid IMO number.

        Returns the digits captured by ``IMO_RE`` prefixed with ``IMO``, or
        None when nothing matches or the check digit is wrong.
        """
        found = IMO_RE.search(text)
        if found is None:
            return None
        number = found.group(2)
        return f"IMO{number}" if cls.is_valid(number) else None

    @classmethod
    def format(cls, value: str) -> str:
        """Strip spaces from `value` and make sure it starts with ``IMO``."""
        compact = value.replace(" ", "")
        return compact if compact.startswith("IMO") else f"IMO{compact}"

`is_valid(text)` `classmethod`

Determine if the given string is a valid IMO number.

Source code in rigour/ids/imo.py

@classmethod
def is_valid(cls, text: str) -> bool:
    """Determine if the given string is a valid IMO number.

    The digits captured by ``IMO_RE`` are accepted when the final digit
    matches either the vessel check digit or the company check digit.
    """
    found = IMO_RE.search(text)
    if found is None:
        return False
    digits = [int(char) for char in found.group(2)]
    check_digit = digits[-1]

    # Vessel scheme: weights count down from 7 across the leading digits.
    vessel_total = 0
    for position, digit in enumerate(digits[:-1]):
        vessel_total += digit * (7 - position)
    if vessel_total % 10 == check_digit:
        return True

    # Company scheme: fixed weights on the first six digits, modulo 11.
    company_weights = (8, 6, 4, 2, 9, 7)
    company_total = sum(
        weight * digits[index] for index, weight in enumerate(company_weights)
    )
    return (11 - (company_total % 11)) % 10 == check_digit

`normalize(text)` `classmethod`

Normalize the given string to a valid IMO number.

Source code in rigour/ids/imo.py

@classmethod
def normalize(cls, text: str) -> Optional[str]:
    """Normalize the given string to a valid IMO number.

    Returns the digits captured by ``IMO_RE`` prefixed with ``IMO``, or None
    when nothing matches or the check digit is wrong.
    """
    found = IMO_RE.search(text)
    if found is None:
        return None
    number = found.group(2)
    return f"IMO{number}" if cls.is_valid(number) else None

`INN`

Bases: IdentifierFormat

Russian tax identification number.

Source code in rigour/ids/stdnum_.py

class INN(IdentifierFormat):
    """Russian tax identification number."""

    TITLE = "INN"

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Report whether `value` passes the `inn` module's validity check."""
        return inn.is_valid(value)

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return the validated, compact form of `value`.

        Returns None when a ValidationError is raised along the way.
        """
        try:
            checked = inn.validate(value)
            return inn.compact(checked)
        except ValidationError:
            return None

`ISIN`

Bases: IdentifierFormat

An ISIN number for a security.

Source code in rigour/ids/stdnum_.py

class ISIN(IdentifierFormat):
    """An ISIN number for a security."""

    TITLE = "ISIN"
    STRONG: bool = True

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Report whether `value` passes the `isin` module's validity check."""
        return isin.is_valid(value)

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return the validated, compact form of `value`.

        Returns None when a ValidationError is raised along the way.
        """
        try:
            checked = isin.validate(value)
            return isin.compact(checked)
        except ValidationError:
            return None

`IdentifierFormat`

Bases: object

Base class for identifier types.

Source code in rigour/ids/common.py

class IdentifierFormat(object):
    """Base class for identifier types."""

    # Display title of the format; subclasses override it.
    TITLE: str = "Generic identifier"
    # Flag that subclasses set to True to mark the format as "strong".
    STRONG: bool = False

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Return True when `normalize` yields a non-empty result."""
        normalized = cls.normalize(value)
        if normalized is None:
            return False
        return len(normalized) > 0

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return `value` with leading and trailing whitespace removed."""
        stripped = value.strip()
        return stripped

    @classmethod
    def format(cls, value: str) -> str:
        """Return `value` converted to upper case."""
        uppercased = value.upper()
        return uppercased

`LEI`

Bases: IdentifierFormat

Legal Entity Identifier (ISO 17442)

Source code in rigour/ids/stdnum_.py

class LEI(IdentifierFormat):
    """Legal Entity Identifier (ISO 17442)"""

    TITLE = "LEI"
    STRONG: bool = True

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Report whether `value` passes the `lei` module's validity check."""
        return lei.is_valid(value)

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return the validated, compact form of `value`.

        Returns None when a ValidationError is raised along the way.
        """
        try:
            checked = lei.validate(value)
            return lei.compact(checked)
        except ValidationError:
            return None

`NPI`

Bases: IdentifierFormat

National Provider Identifier.

Source code in rigour/ids/npi.py

class NPI(IdentifierFormat):
    """National Provider Identifier."""

    TITLE: str = "NPI"
    STRONG: bool = True

    # cf. https://www.johndcook.com/blog/2024/06/26/npi-number/

    @classmethod
    def is_valid(cls, text: str) -> bool:
        """Determine if the given string is a valid NPI.

        The text must match ``NPI_RE``, must not be listed in ``INVALID``,
        and must pass the Luhn check.
        """
        if NPI_RE.match(text) is None or text in INVALID:
            return False

        # Ten-character input gets the "80840" prefix before the Luhn check.
        candidate = "80840" + text if len(text) == 10 else text
        return bool(luhn.is_valid(candidate))

    @classmethod
    def normalize(cls, text: str) -> Optional[str]:
        """Normalize the given string to a valid NPI.

        Returns the first group captured by ``NPI_RE`` if it is valid and
        not listed in ``INVALID``; otherwise None.
        """
        found = NPI_RE.search(text)
        if found is None:
            return None
        candidate = found.group(1)
        if not cls.is_valid(candidate) or candidate in INVALID:
            return None
        return candidate

`is_valid(text)` `classmethod`

Determine if the given string is a valid NPI.

Source code in rigour/ids/npi.py

@classmethod
def is_valid(cls, text: str) -> bool:
    """Determine if the given string is a valid NPI.

    The text must match ``NPI_RE``, must not be listed in ``INVALID``, and
    must pass the Luhn check.
    """
    if NPI_RE.match(text) is None or text in INVALID:
        return False

    # Ten-character input gets the "80840" prefix before the Luhn check.
    candidate = "80840" + text if len(text) == 10 else text
    return bool(luhn.is_valid(candidate))

`normalize(text)` `classmethod`

Normalize the given string to a valid NPI.

Source code in rigour/ids/npi.py

@classmethod
def normalize(cls, text: str) -> Optional[str]:
    """Normalize the given string to a valid NPI.

    Returns the first group captured by ``NPI_RE`` if it is valid and not
    listed in ``INVALID``; otherwise None.
    """
    found = NPI_RE.search(text)
    if found is None:
        return None
    candidate = found.group(1)
    if not cls.is_valid(candidate) or candidate in INVALID:
        return None
    return candidate

`OGRN`

Bases: IdentifierFormat

Primary State Registration Number (Russian company registration).

Source code in rigour/ids/ogrn.py

class OGRN(IdentifierFormat):
    """Primary State Registration Number (Russian company registration)."""

    TITLE: str = "OGRN"
    STRONG: bool = True

    # cf. https://docs.trellix.com/de-DE/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-945B4343-861E-4A57-8E60-8B6028871BA1.html

    @classmethod
    def is_valid(cls, text: str) -> bool:
        """Determine if the given string is a valid OGRN.

        The text must match ``OGRN_RE``, must not start with "0", and its
        last digit must equal the computed control digit.
        """
        if OGRN_RE.match(text) is None:
            return False

        # Validate registration type
        if text[0] == "0":
            return False

        # Validate control digit logic
        control_digit = int(text[-1])
        return control_digit == cls.calculate_control_digit(text)

    @classmethod
    def normalize(cls, text: str) -> Optional[str]:
        """Normalize the given string to a valid OGRN.

        Returns the first group captured by ``OGRN_RE`` if it is valid;
        otherwise None.
        """
        match = OGRN_RE.search(text)
        if match is None:
            return None
        value = match.group(1)
        if cls.is_valid(value):
            return value
        return None

    @classmethod
    def calculate_control_digit(cls, grn: str) -> Optional[int]:
        """Compute the expected control digit for a 13- or 15-character number.

        The control digit is the last decimal digit of the remainder obtained
        by dividing the leading digits by 11 (13-character numbers) or by 13
        (15-character numbers). Returns None for any other length.
        """
        if len(grn) == 13:
            # Remainder is 0..10; taking it modulo 10 maps 10 to 0.
            return int(grn[:12]) % 11 % 10
        if len(grn) == 15:
            # Remainder is 0..12; 10, 11 and 12 must fold to 0, 1 and 2,
            # otherwise they could never equal a single check digit.
            return int(grn[:14]) % 13 % 10
        return None

`is_valid(text)` `classmethod`

Determine if the given string is a valid OGRN.

Source code in rigour/ids/ogrn.py

@classmethod
def is_valid(cls, text: str) -> bool:
    """Determine if the given string is a valid OGRN.

    The text must match ``OGRN_RE``, must not start with "0", and its last
    digit must equal the computed control digit.
    """
    # Reject text that fails the pattern or has a "0" registration type.
    if OGRN_RE.match(text) is None or text[0] == "0":
        return False

    # Compare the trailing digit with the computed control digit.
    last_digit = int(text[-1])
    expected = cls.calculate_control_digit(text)
    return last_digit == expected

`normalize(text)` `classmethod`

Normalize the given string to a valid OGRN.

Source code in rigour/ids/ogrn.py

@classmethod
def normalize(cls, text: str) -> Optional[str]:
    """Normalize the given string to a valid OGRN.

    Returns the first group captured by ``OGRN_RE`` if it is valid;
    otherwise None.
    """
    found = OGRN_RE.search(text)
    if found is None:
        return None
    candidate = found.group(1)
    return candidate if cls.is_valid(candidate) else None

`SSN`

Bases: IdentifierFormat

US Social Security Number

Source code in rigour/ids/stdnum_.py

class SSN(IdentifierFormat):
    """US Social Security Number"""

    TITLE = "SSN"
    STRONG: bool = False

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Report whether `value` passes the `ssn` module's validity check."""
        return ssn.is_valid(value)

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return the validated, compact form of `value`.

        Returns None when a ValidationError is raised along the way.
        """
        try:
            checked = ssn.validate(value)
            return ssn.compact(checked)
        except ValidationError:
            return None

    @classmethod
    def format(cls, value: str) -> str:
        """Return `value` rendered by the `ssn` module's formatter."""
        return ssn.format(value)

`StrictFormat`

Bases: IdentifierFormat

A generic identifier type that applies harsh normalization.

Source code in rigour/ids/strict.py

class StrictFormat(IdentifierFormat):
    """A generic identifier type that applies harsh normalization."""

    TITLE: str = "Strict identifier"

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """Return True when the normalized form has more than two characters."""
        normalized = cls.normalize(value)
        if normalized is None:
            return False
        return len(normalized) > 2

    @classmethod
    def normalize(cls, value: str) -> Optional[str]:
        """Return the upper-cased alphanumeric characters of `value`.

        The input is first passed through ``ascii_text``. None is returned
        when that yields None or fewer than two characters.
        """
        transliterated = ascii_text(value)
        if transliterated is None or len(transliterated) < 2:
            return None
        # Drop everything that is not a letter or a digit.
        kept = "".join(char for char in transliterated if char.isalnum())
        return kept.upper()

`UEI`

Bases: IdentifierFormat

US GSA Unique Entity ID.

Source code in rigour/ids/uei.py

class UEI(IdentifierFormat):
    """US GSA Unique Entity ID."""

    # https://www.gsa.gov/about-us/organization/federal-acquisition-service/integrated-award-environment-iae/iae-systems-information-kit/uei-technical-specifications-and-api-information

    # The title was previously the copy-pasted value "NPI".
    TITLE: str = "UEI"
    STRONG: bool = False
    """Marked false because the SAM database is massively duplicated, and entities in
    SAM conflate companies and their owners. This makes UEIs more like cluster IDs than
    unique entity identifiers."""

    @classmethod
    def is_valid(cls, text: str) -> bool:
        """Determine if the given string is a valid UEI.

        The text must match ``UEI_RE`` and must not start with "0". No
        checksum is verified.
        """
        if UEI_RE.match(text) is None:
            return False

        if text.startswith("0"):
            return False

        # TODO: Figure out checksum scheme
        return True

    @classmethod
    def normalize(cls, text: str) -> Optional[str]:
        """Normalize the given string to a valid UEI.

        Returns the first group captured by ``UEI_RE`` in upper case if it is
        valid; otherwise None.
        """
        match = UEI_RE.search(text)
        if match is None:
            return None
        value = match.group(1)
        if not cls.is_valid(value):
            return None
        return value.upper()

    @classmethod
    def format(cls, value: str) -> str:
        """Return `value` converted to upper case."""
        return value.upper()

`STRONG = False` `class-attribute` `instance-attribute`

Marked false because the SAM database is massively duplicated, and entities in SAM conflate companies and their owners. This makes UEIs more like cluster IDs than unique entity identifiers.

`is_valid(text)` `classmethod`

Determine if the given string is a valid UEI.

Source code in rigour/ids/uei.py

@classmethod
def is_valid(cls, text: str) -> bool:
    """Determine if the given string is a valid UEI.

    The text must match ``UEI_RE`` and must not start with "0". No checksum
    is verified.
    """
    # TODO: Figure out checksum scheme
    return UEI_RE.match(text) is not None and not text.startswith("0")

`normalize(text)` `classmethod`

Normalize the given string to a valid UEI.

Source code in rigour/ids/uei.py

@classmethod
def normalize(cls, text: str) -> Optional[str]:
    """Normalize the given string to a valid UEI.

    Returns the first group captured by ``UEI_RE`` in upper case if it is
    valid; otherwise None.
    """
    found = UEI_RE.search(text)
    if found is None:
        return None
    candidate = found.group(1)
    return candidate.upper() if cls.is_valid(candidate) else None

`WikidataQID`

Bases: IdentifierFormat

A wikidata item identifier.

Source code in rigour/ids/wikidata.py

class WikidataQID(IdentifierFormat):
    """A wikidata item identifier."""

    TITLE: str = "Wikidata QID"
    STRONG: bool = True

    @classmethod
    def is_valid(cls, text: str) -> bool:
        """Determine if the given string is a valid wikidata QID."""
        return is_qid(text)

    @classmethod
    def normalize(cls, text: str) -> Optional[str]:
        """Normalize the given string to a valid wikidata QID.

        Takes whatever follows the last "/" in `text`, strips whitespace and
        upper-cases it. Returns None when the result does not match ``QID``.
        """
        tail = text.rsplit("/", 1)[-1]
        candidate = tail.strip().upper()
        return candidate if QID.match(candidate) is not None else None

`is_valid(text)` `classmethod`

Determine if the given string is a valid wikidata QID.

Source code in rigour/ids/wikidata.py

@classmethod
def is_valid(cls, text: str) -> bool:
    """Determine if the given string is a valid wikidata QID.

    The decision is delegated entirely to ``is_qid``.
    """
    verdict = is_qid(text)
    return verdict

`normalize(text)` `classmethod`

Normalize the given string to a valid wikidata QID.

Source code in rigour/ids/wikidata.py

@classmethod
def normalize(cls, text: str) -> Optional[str]:
    """Normalize the given string to a valid wikidata QID.

    Takes whatever follows the last "/" in `text`, strips whitespace and
    upper-cases it. Returns None when the result does not match ``QID``.
    """
    tail = text.rsplit("/", 1)[-1]
    candidate = tail.strip().upper()
    return candidate if QID.match(candidate) is not None else None

`get_identifier_format(name)`

Get the identifier type class for the given format name.

Source code in rigour/ids/__init__.py

def get_identifier_format(name: str) -> Type[IdentifierFormat]:
    """Get the identifier type class for the given format name.

    The class is looked up in ``FORMATS`` by subscripting.
    NOTE(review): presumably ``FORMATS`` is a dict, so an unknown name raises
    KeyError - confirm.
    """
    format_class = FORMATS[name]
    return format_class

`get_identifier_format_names()`

Get a list of all identifier type names.

Source code in rigour/ids/__init__.py

def get_identifier_format_names() -> List[str]:
    """Get a list of all identifier type names.

    A new list holding the keys of ``FORMATS`` is built on every call.
    """
    return [*FORMATS.keys()]

`get_identifier_formats()`

Get a list of all identifier formats.

Source code in rigour/ids/__init__.py

def get_identifier_formats() -> List[FormatSpec]:
    """Get a list of all identifier formats.

    One entry is produced per distinct class in ``FORMATS``, listing every
    name registered for that class. Entries are sorted by title.
    """
    specs: List[FormatSpec] = []
    for format_class in set(FORMATS.values()):
        # Collect every alias that points at this class.
        aliases = []
        for alias, registered in FORMATS.items():
            if registered == format_class:
                aliases.append(alias)
        spec: FormatSpec = {
            "names": aliases,
            "title": format_class.TITLE,
            "description": format_class.__doc__ or "",
        }
        specs.append(spec)
    specs.sort(key=lambda entry: entry["title"])
    return specs

`get_strong_format_names()` `cached`

Get a list of all strong identifier type names.

Source code in rigour/ids/__init__.py

@cache
def get_strong_format_names() -> List[str]:
    """Get a list of all strong identifier type names.

    A name is included when the class registered under it in ``FORMATS`` has
    a truthy ``STRONG`` attribute.
    NOTE(review): assuming ``cache`` is ``functools.cache``, every caller
    receives the same list object - confirm that callers never mutate it.
    """
    strong_names = []
    for alias, format_class in FORMATS.items():
        if format_class.STRONG:
            strong_names.append(alias)
    return strong_names

Identifiers

rigour.ids

BIC

CPF

FIGI

FormatSpec

IBAN

IMO

is_valid(text) classmethod

normalize(text) classmethod

INN

ISIN

IdentifierFormat

LEI

NPI

is_valid(text) classmethod

normalize(text) classmethod

OGRN

is_valid(text) classmethod

normalize(text) classmethod

SSN

StrictFormat

UEI

STRONG = False class-attribute instance-attribute

is_valid(text) classmethod

normalize(text) classmethod

WikidataQID

is_valid(text) classmethod

normalize(text) classmethod

get_identifier_format(name)

get_identifier_format_names()

get_identifier_formats()

get_strong_format_names() cached

`rigour.ids`

`BIC`

`CPF`

`FIGI`

`FormatSpec`

`IBAN`

`IMO`

`is_valid(text)` `classmethod`

`normalize(text)` `classmethod`

`INN`

`ISIN`

`IdentifierFormat`

`LEI`

`NPI`

`is_valid(text)` `classmethod`

`normalize(text)` `classmethod`

`OGRN`

`is_valid(text)` `classmethod`

`normalize(text)` `classmethod`

`SSN`

`StrictFormat`

`UEI`

`STRONG = False` `class-attribute` `instance-attribute`

`is_valid(text)` `classmethod`

`normalize(text)` `classmethod`

`WikidataQID`

`is_valid(text)` `classmethod`

`normalize(text)` `classmethod`

`get_identifier_format(name)`

`get_identifier_format_names()`

`get_identifier_formats()`

`get_strong_format_names()` `cached`