43 lines
etl/email_normalizer.py
Normalises email addresses and groups contacts that share a canonical email.
# Email normalisation for contact deduplication in the ETL pipeline.
from typing import List
 
 
def canonical_email(email: str) -> str:
    """Return the canonical form of an email address.

    The canonical form has surrounding whitespace stripped and the entire
    address lowercased. Two addresses that differ only by case or surrounding
    whitespace represent the same contact and therefore map to the same
    canonical string.

    Parameters
    ----------
    email : str
        Raw email address, potentially with surrounding whitespace or mixed case.

    Returns
    -------
    str
        Normalised address suitable for use as a deduplication key.
    """
    # Strip first, then lowercase: both steps are required so that
    # " Foo@Example.COM " and "foo@example.com" yield the same key.
    return email.strip().lower()
 
 
def find_duplicates(contacts: List[dict]) -> List[List[str]]:
    """Collect the contact IDs whose emails map to the same canonical key.

    Each contact's 'email' is passed through ``canonical_email`` and the
    resulting string is used as the grouping key.

    Parameters
    ----------
    contacts : list of dict
        Each dict has 'contact_id' (str) and 'email' (str).

    Returns
    -------
    list of list of str
        One list of contact_id values per canonical email that occurs more
        than once. Groups appear in the order their key was first seen, and
        IDs within a group follow input order. Keys with a single contact
        are left out.
    """
    ids_by_email: dict = {}
    for record in contacts:
        canonical = canonical_email(record["email"])
        # Create the bucket on first sight of this key, then add the ID.
        if canonical not in ids_by_email:
            ids_by_email[canonical] = []
        ids_by_email[canonical].append(record["contact_id"])

    duplicates = []
    for members in ids_by_email.values():
        # Only keys shared by two or more contacts count as duplicates.
        if len(members) >= 2:
            duplicates.append(members)
    return duplicates