43 lines
etl/email_normalizer.py
Normalises email addresses and groups contacts that share a canonical email.
# Email normalisation for contact deduplication in the ETL pipeline.
from typing import List


def canonical_email(email: str) -> str:
    """Return the canonical form of an email address.

    The canonical form has surrounding whitespace stripped and the entire
    address lowercased. Two addresses that differ only by case or
    surrounding whitespace represent the same contact.

    Parameters
    ----------
    email : str
        Raw email address, potentially with surrounding whitespace or
        mixed case.

    Returns
    -------
    str
        Normalised address suitable for use as a deduplication key.
    """
    # Strip first, then lowercase: both steps are required so that
    # " Alice@Example.com " and "alice@example.com" map to the same key.
    return email.strip().lower()


def find_duplicates(contacts: List[dict]) -> List[List[str]]:
    """Group contact IDs that share a canonical email address.

    Parameters
    ----------
    contacts : list of dict
        Each dict has 'contact_id' (str) and 'email' (str).

    Returns
    -------
    list of list of str
        Groups of contact_id values that share the same canonical email.
        Single-member groups are omitted. Groups appear in the order their
        canonical email was first seen, and IDs within a group keep their
        input order.
    """
    groups: dict = {}
    for contact in contacts:
        # Key on the canonical form so case/whitespace variants collide.
        key = canonical_email(contact["email"])
        groups.setdefault(key, []).append(contact["contact_id"])
    # Only emails seen on more than one contact count as duplicates.
    return [ids for ids in groups.values() if len(ids) > 1]