Contact Email Normalizer Code Review Challenge

43 lines

etl/email_normalizer.py

Normalises email addresses and groups contacts that share a canonical email.

# Email normalisation for contact deduplication in the ETL pipeline.

from typing import List

def canonical_email(email: str) -> str:
    """Return the canonical form of an email address.

    The canonical form has surrounding whitespace stripped and the entire
    address lowercased. Two addresses that differ only by case or surrounding
    whitespace represent the same contact.

    Parameters
    ----------
    email : str
        Raw email address, potentially with surrounding whitespace or mixed case.

    Returns
    -------
    str
        Normalised address suitable for use as a deduplication key.
    """
    # Lowercasing is required by the contract above: without it, addresses
    # such as "Bob@Example.com" and "bob@example.com" yield different keys.
    return email.strip().lower()

def find_duplicates(contacts: List[dict]) -> List[List[str]]:
    """Collect the IDs of contacts whose emails share one canonical form.

    Parameters
    ----------
    contacts : list of dict
        Records carrying a 'contact_id' (str) and an 'email' (str) entry.

    Returns
    -------
    list of list of str
        One list of contact_id values for every canonical email used by
        two or more contacts. Emails used by a single contact produce no
        group.
    """
    ids_by_email: dict = {}
    for record in contacts:
        canonical = canonical_email(record["email"])
        # Start a fresh bucket the first time a canonical email is seen.
        if canonical not in ids_by_email:
            ids_by_email[canonical] = []
        ids_by_email[canonical].append(record["contact_id"])

    # Keep only buckets that actually contain duplicates.
    duplicate_groups = []
    for id_group in ids_by_email.values():
        if len(id_group) >= 2:
            duplicate_groups.append(id_group)
    return duplicate_groups

No findings yet

Click on a line number in the code to add a finding

You need to log in or sign up to submit.

You can inspect the challenge for free, but AI review and results require an account.