Skip to content

URLs

rigour.urls

build_url(url, params=None)

Compose a URL with the given query parameters.

Source code in rigour/urls/cleaning.py
def build_url(url: str, params: ParamsType = None) -> str:
    """Return ``url`` with extra query parameters appended.

    Query pairs already present in ``url`` are kept in their original order
    (blank values included). The pairs from ``params`` -- the items of a
    mapping, or an iterable of pairs -- are sorted and added after them. The
    whole query string is then re-encoded.
    """
    parts = urlparse(url)
    pairs = parse_qsl(parts.query, keep_blank_values=True)
    if params is not None:
        if isinstance(params, Mapping):
            extra = list(params.items())
        else:
            extra = list(params)
        # Sorting only the added pairs keeps the output deterministic
        # without reordering what the URL already carried.
        extra.sort()
        pairs += extra
    return urlunparse(parts._replace(query=urlencode(pairs)))

clean_url(text)

Perform intensive care on URLs to make sure they have a scheme and a host name. If no scheme is given, HTTP is assumed.

Source code in rigour/urls/cleaning.py
def clean_url(text: str) -> Optional[str]:
    """Perform intensive care on a URL so that it has a scheme and a host
    name; HTTP is assumed if no scheme is given.

    The cleaning itself is delegated to ``_clean_url``. Returns the cleaned
    URL as a string, or ``None`` when ``_clean_url`` yields ``None``.
    """
    result = _clean_url(text)
    return None if result is None else result.geturl()

clean_url_compare(text)

Destructively clean a URL for comparison.

Source code in rigour/urls/cleaning.py
def clean_url_compare(text: str) -> Optional[str]:
    """Destructively clean a URL for comparison.

    The URL is first passed through ``_clean_url``. If that yields ``None``,
    ``None`` is returned. Otherwise the following lossy normalisations are
    applied so that trivially different spellings compare equal:

    * an ``https`` scheme is rewritten to ``http``;
    * the network location is lower-cased;
    * a leading ``www.`` label is dropped from the host;
    * the fragment is removed;
    * query parameters with blank values are dropped and the remaining
      pairs are sorted and re-encoded.

    The result is only meant for equality checks, not for fetching.
    """
    parsed = _clean_url(text)
    if parsed is None:
        return None
    if parsed.scheme == "https":
        parsed = parsed._replace(scheme="http")
    netloc = parsed.netloc.lower()
    # Strip "www." only when it is the first label of the host. A blanket
    # str.replace would also rewrite hosts such as "awww.example.com" or
    # "foo.www.bar.com" and make unrelated hosts compare as equal. The
    # optional "userinfo@" part is split off so the prefix test looks at the
    # host itself.
    userinfo, at, host = netloc.rpartition("@")
    if host.startswith("www."):
        host = host[len("www."):]
    parsed = parsed._replace(netloc=userinfo + at + host, fragment="")
    # keep_blank_values=False discards parameters that carry no value.
    query = parse_qsl(parsed.query, keep_blank_values=False)
    parsed = parsed._replace(query=urlencode(sorted(query)))
    return parsed.geturl()

compare_urls(left, right)

Compare two URLs and return a float between 0 and 1 representing the similarity between them. Before comparison, clean both URLs in a destructive way.

Source code in rigour/urls/compare.py
def compare_urls(left: str, right: str) -> float:
    """Score the similarity of two URLs as a float between 0 and 1.

    Both URLs are destructively cleaned with ``clean_url_compare`` first.
    The score is 1.0 when both cleaned forms exist and are equal, and 0.0
    otherwise, including when either URL cleans to ``None``.
    """
    lhs = clean_url_compare(left)
    rhs = clean_url_compare(right)
    comparable = lhs is not None and rhs is not None
    return 1.0 if comparable and lhs == rhs else 0.0