MP-Manager/scripts/audit_brand_vs_branches_totals.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""audit_brand_vs_branches_totals.py

Comparativa de conteos totales entre la cuenta de Marca Principal (Monte
Providencia) y la suma de TODAS las sucursales. Las cuentas demo se excluyen
automaticamente (nombre contiene 'demo', case-insensitive).

Lo que produce:

  - Totales agregados (contactos y oportunidades) Marca vs Sucursales.
  - Desglose por sucursal con el conteo local.
  - Listado de contactos en sucursal que no estan en Marca.
  - Listado de contactos en Marca que no estan en la sucursal que les
    corresponde segun el verificador (campo TIENDA del contacto Marca cruzado
    con la columna TIENDA del verificador y la columna ID LOCATION BUCEFALO).
  - Listado de oportunidades en sucursal sin contraparte en Marca.

Es read-only sobre `mp_manager.sqlite`. No toca GHL. La logica vive en
`run_audit()` para que el endpoint del dashboard la reutilice.

Uso CLI:
    python scripts/audit_brand_vs_branches_totals.py
    python scripts/audit_brand_vs_branches_totals.py --show-missing
    python scripts/audit_brand_vs_branches_totals.py --json
    python scripts/audit_brand_vs_branches_totals.py --limit-missing 100
"""

import argparse
import csv
import json
import os
import re
import sqlite3
import sys
import unicodedata
from collections import defaultdict


# Absolute path of the directory one level above this script's directory.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
# Absolute path of the directory that contains this script.
SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

# Project imports run after the sys.path edits above (presumably because
# `paths` and `common` live in one of those two directories -- TODO confirm).
from paths import DB_PATH
from common import match_contacts as _match_contacts
# Verifier spreadsheet (CSV export) expected directly under ROOT_DIR.
VERIFIER_CSV = os.path.join(
    ROOT_DIR, "Monte Providencia - Verificador de sucursales y correos - Sucursales.csv"
)
# location_id that identifies the Brand account among the rows of `accounts`.
BRAND_LOCATION_ID = "GbKkBpCmKu2QmloKFHy3"
# Default `threshold` that find_match forwards to common.match_contacts.
MATCH_THRESHOLD = 0.80

# Digital hub consolidation. Some "shell" branches (physical / non-digital)
# do not receive digital leads: those leads live in the "hub" location that
# absorbs them. The Verifier CSV does not always carry the digital -> hub row
# for every store (e.g. TIENDA=METEPEC only maps to 85937, which is empty, while
# the leads live in Pilares 85935). Without this map those contacts show up as
# "present in another branch, not the assigned one" -> massive false-positive noise.
# Documented in the memory note `verificador_tipo_de_tienda_colapso` and in
# Baserow 750 (cluster Toluca / Metepec / Lerma -> Pilares 85935).
PILARES_HUB_LOC = "uZnMH5bO6MXTHcgHeyq9"  # 85935 - MP - Pilares (digital hub)
# shell location_id -> hub location_id that holds its digital leads.
DIGITAL_HUB_BY_SHELL = {
    "NSDniGzjxotVDNa5YxqW": PILARES_HUB_LOC,  # 85937 - MP - METEPEC
    "Xqpdy12avIk4NFsOhPBX": PILARES_HUB_LOC,  # 85941 - MP - Grand Plaza
    "pMPs9M4RaGJvWwfIFVIo": PILARES_HUB_LOC,  # 85941 - MP - Grand Plaza Toluca
    "RLAs9sQwbW2DOwzrTMYI": PILARES_HUB_LOC,  # 85939 - MP - Independencia
    "UsHXqoj2l6ND7Uc7sEo2": PILARES_HUB_LOC,  # 85938 - MP - SENDERO
    "lWp7F6rsgTjy3voFBZ1m": PILARES_HUB_LOC,  # 85935 - MP - Lerma
    "clhDZ0hIllKfV0AcgW53": PILARES_HUB_LOC,  # 85940 - MP - Isidro Fabela (0 contacts, NOT DIGITAL)
}

# Matches "demo" only as a whole word (\b...\b), case-insensitively; a name
# such as "demostracion" does not match.
DEMO_PATTERN = re.compile(r"\bdemo\b", re.IGNORECASE)


# ---------------------------------------------------------------------------
# Utilidades
# ---------------------------------------------------------------------------

def safe_print(*args, **kwargs):
    """Write the arguments to stdout like ``print``, tolerating encoding errors.

    Only the ``sep`` and ``end`` keyword arguments are honoured. If the stream
    raises ``UnicodeEncodeError``, the text is re-encoded with the stream's
    encoding (falling back to UTF-8 when the stream reports none) using
    ``errors="replace"`` and written again.
    """
    separator = kwargs.get("sep", " ")
    terminator = kwargs.get("end", "\n")
    message = separator.join(map(str, args))
    codec = sys.stdout.encoding or "utf-8"
    try:
        sys.stdout.write(message + terminator)
        sys.stdout.flush()
    except UnicodeEncodeError:
        # Round-trip through the console codec so unencodable characters are
        # swapped for the codec's replacement marker.
        degraded = message.encode(codec, errors="replace").decode(codec)
        sys.stdout.write(degraded + terminator)
        sys.stdout.flush()


def strip_accents(value):
    """Return ``str(value)`` without combining accent marks.

    The text is decomposed with Unicode NFD and every character in category
    ``Mn`` (non-spacing mark) is dropped. Falsy input yields ``""``.
    """
    if not value:
        return ""
    decomposed = unicodedata.normalize("NFD", str(value))
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(base_chars)


def normalize_phone(phone, last_n=10):
    """Reduce a phone value to its digits, keeping at most the last ``last_n``.

    Numbers with fewer than ``last_n`` digits are returned whole; falsy input
    yields ``""``.
    """
    only_digits = "".join(re.findall(r"\d", str(phone or "")))
    if len(only_digits) < last_n:
        return only_digits
    return only_digits[-last_n:]


def normalize_email(email):
    """Return the email as a trimmed, lowercase string (``""`` for falsy input)."""
    raw = str(email) if email else ""
    return raw.strip().lower()


def normalize_tienda(value):
    """Normalize a store name for matching.

    Accents are removed, the text is upper-cased and runs of whitespace are
    collapsed to single spaces. Falsy input yields ``""``.
    """
    if not value:
        return ""
    tokens = strip_accents(str(value)).upper().split()
    return " ".join(tokens)


def is_demo_account(name):
    """Return True when ``name`` is non-empty and DEMO_PATTERN finds a match in it."""
    if not name:
        return False
    return DEMO_PATTERN.search(name) is not None


# Detects contacts that look like test data: standalone keywords (delimited by
# non-alphanumeric separators or by the string edges) in any of the basic
# fields. It is meant to match 'Juan Prueba', 'test@test.com', '+52 prueba',
# but NOT produce false positives such as 'Pruebal' or 'Contestino'.
TEST_KEYWORDS_PATTERN = re.compile(
    r"(?:^|[^a-z0-9])(test|testing|prueba|pruebas)(?:$|[^a-z0-9])",
    re.IGNORECASE,
)


def looks_like_test_contact(c):
    """Return True if the name, email or phone contains a test keyword.

    The non-empty ``first_name``, ``last_name``, ``email`` and ``phone`` values
    are accent-stripped, lower-cased and joined with spaces before being
    searched with TEST_KEYWORDS_PATTERN.
    """
    field_names = ("first_name", "last_name", "email", "phone")
    raw_values = (c.get(field) for field in field_names)
    pieces = [strip_accents(value).lower() for value in raw_values if value]
    searchable = " ".join(pieces)
    if not searchable:
        return False
    return TEST_KEYWORDS_PATTERN.search(searchable) is not None


def fmt_contact(c):
    """Build the compact ``{id, name, phone, email}`` view of a contact dict.

    ``name`` joins first and last name; it becomes ``"Sin nombre"`` when both
    are missing. Missing phone/email are reported as empty strings.
    """
    first = c.get("first_name") or ""
    last = c.get("last_name") or ""
    display_name = f"{first} {last}".strip()
    if not display_name:
        display_name = "Sin nombre"
    summary = {"id": c.get("id"), "name": display_name}
    summary["phone"] = c.get("phone") or ""
    summary["email"] = c.get("email") or ""
    return summary


# ---------------------------------------------------------------------------
# Carga de datos
# ---------------------------------------------------------------------------

def load_verifier():
    """Read the verifier CSV into two lookup tables.

    Returns:
        ``(by_location, by_tienda)`` where ``by_location`` maps each
        ``ID LOCATION BUCEFALO`` value to ``{"tienda_raw", "tienda_norm",
        "sucursal"}`` and ``by_tienda`` maps a normalized TIENDA name to a
        location id. Both are empty when the CSV file does not exist.
    """
    by_location, by_tienda = {}, {}
    if not os.path.exists(VERIFIER_CSV):
        return by_location, by_tienda

    def _cell(record, column):
        # Missing or None cells are treated as empty strings.
        return (record.get(column) or "").strip()

    with open(VERIFIER_CSV, encoding="utf-8-sig", newline="") as handle:
        for record in csv.DictReader(handle):
            location_id = _cell(record, "ID LOCATION BUCEFALO")
            if not location_id:
                continue
            tienda_raw = _cell(record, "TIENDA")
            sucursal_raw = _cell(record, "SUCURSAL")
            # A lone "-" in the sheet is treated the same as an empty cell.
            has_tienda = bool(tienda_raw) and tienda_raw != "-"
            has_sucursal = bool(sucursal_raw) and sucursal_raw != "-"
            tienda_norm = normalize_tienda(tienda_raw) if has_tienda else None
            # A later row with the same location id overwrites the earlier one.
            by_location[location_id] = {
                "tienda_raw": tienda_raw,
                "tienda_norm": tienda_norm,
                "sucursal": sucursal_raw if has_sucursal else None,
            }
            if not tienda_norm or location_id == BRAND_LOCATION_ID:
                continue
            # A store may appear on several rows; when two branches share the
            # same TIENDA, the first one seen keeps the reverse mapping.
            if tienda_norm not in by_tienda:
                by_tienda[tienda_norm] = location_id
    return by_location, by_tienda


def resolve_tienda_field_id(conn, location_id):
    """Look up the id of the contact field named 'TIENDA' for a location.

    Reads ``object_schemas`` and compares field names after removing accents,
    lower-casing and trimming. Returns the first match's ``field_id`` or None.
    """
    schema_rows = conn.execute(
        "SELECT field_id, field_name FROM object_schemas WHERE location_id=? AND object_key='contact'",
        (location_id,),
    ).fetchall()
    candidates = (
        row["field_id"]
        for row in schema_rows
        if strip_accents(row["field_name"]).lower().strip() == "tienda"
    )
    return next(candidates, None)


def extract_tienda_from_custom_fields(custom_fields_json, field_id):
    """Return the value stored for ``field_id`` in a custom-fields JSON string.

    Args:
        custom_fields_json: JSON text expected to decode to a list of
            custom-field dicts.
        field_id: id to look for under the ``id`` or ``fieldId`` key.

    Returns:
        The first value among ``value``, ``fieldValue`` and
        ``fieldValueString`` that is neither None nor ``""`` in a matching
        entry, or None when the arguments are empty, the JSON is invalid or
        not a list, or no matching entry carries a value.
    """
    if not field_id or not custom_fields_json:
        return None
    try:
        cfs = json.loads(custom_fields_json)
    except (TypeError, ValueError, RecursionError):
        # Undecodable payloads are treated as "no value" (best effort).
        return None
    if not isinstance(cfs, list):
        return None
    for cf in cfs:
        # Skip stray non-dict entries (null, strings, numbers) instead of
        # crashing on `.get`.
        if not isinstance(cf, dict):
            continue
        if field_id not in (cf.get("id"), cf.get("fieldId")):
            continue
        for key in ("value", "fieldValue", "fieldValueString"):
            v = cf.get(key)
            if v is not None and v != "":
                return v
    return None


def resolve_sucursal_field_id(conn, location_id):
    """Look up the id of the contact custom field named 'Sucursal' for a location.

    Field names from ``object_schemas`` are compared after removing accents,
    lower-casing and trimming. Returns the first match's ``field_id`` or None.
    """
    schema_rows = conn.execute(
        "SELECT field_id, field_name FROM object_schemas WHERE location_id=? AND object_key='contact'",
        (location_id,),
    ).fetchall()
    candidates = (
        row["field_id"]
        for row in schema_rows
        if strip_accents(row["field_name"]).lower().strip() == "sucursal"
    )
    return next(candidates, None)


# Opportunity field that links Branch <-> Brand (deterministic key).
OPP_LINK_FIELD_KEY = "opportunity.id_oportunidad_sucursal"
# Accent-free, lower-case field name used as a fallback when the key is absent.
OPP_LINK_FIELD_NAME = "id oportunidad sucursal"
# Native GHL ids are exactly 20 alphanumeric characters.
OPP_ID_PATTERN = re.compile(r"^[A-Za-z0-9]{20}$")


def resolve_opp_link_field_id(conn, location_id):
    """Find the id of the opportunity field 'ID Oportunidad Sucursal' for a location.

    A row whose ``field_key`` equals OPP_LINK_FIELD_KEY wins; otherwise the
    first row whose normalized ``field_name`` equals OPP_LINK_FIELD_NAME is
    used. Returns None when neither lookup finds a row.
    """
    schema_rows = conn.execute(
        "SELECT field_id, field_name, field_key FROM object_schemas "
        "WHERE location_id=? AND object_key='opportunity'",
        (location_id,),
    ).fetchall()

    # First pass: exact match on the field key.
    key_hit = next(
        (row for row in schema_rows if row["field_key"] == OPP_LINK_FIELD_KEY),
        None,
    )
    if key_hit is not None:
        return key_hit["field_id"]

    # Second pass: tolerant match on the display name.
    name_hit = next(
        (
            row
            for row in schema_rows
            if strip_accents(row["field_name"]).lower().strip() == OPP_LINK_FIELD_NAME
        ),
        None,
    )
    if name_hit is None:
        return None
    return name_hit["field_id"]


def extract_opp_link_value(custom_fields_json, field_id):
    """Return the 'ID Oportunidad Sucursal' value of an opportunity, if any.

    Args:
        custom_fields_json: JSON text expected to decode to a list of
            custom-field dicts.
        field_id: id to look for under the ``id`` or ``fieldId`` key.

    Returns:
        The first value among ``value``, ``fieldValue`` and
        ``fieldValueString`` that is neither None nor ``""`` in a matching
        entry, or None when the arguments are empty, the JSON is invalid or
        not a list, or the field is absent/empty.
    """
    if not field_id or not custom_fields_json:
        return None
    try:
        cfs = json.loads(custom_fields_json)
    except (TypeError, ValueError, RecursionError):
        # Undecodable payloads are treated as "no value" (best effort).
        return None
    if not isinstance(cfs, list):
        return None
    for cf in cfs:
        # Skip stray non-dict entries (null, strings, numbers) instead of
        # crashing on `.get`.
        if not isinstance(cf, dict):
            continue
        if field_id not in (cf.get("id"), cf.get("fieldId")):
            continue
        for key in ("value", "fieldValue", "fieldValueString"):
            v = cf.get(key)
            if v is not None and v != "":
                return v
    return None


# Contact field that links Branch <-> Brand (deterministic key, parallel to the
# opportunity link field).
CONTACT_LINK_FIELD_KEY = "contact.id_contacto_sucursal"
# Accent-free, lower-case field name used as a fallback when the key is absent.
CONTACT_LINK_FIELD_NAME = "id contacto sucursal"


def resolve_contact_link_field_id(conn, location_id):
    """Find the id of the contact field 'ID Contacto Sucursal' for a location.

    A row whose ``field_key`` equals CONTACT_LINK_FIELD_KEY wins; otherwise the
    first row whose normalized ``field_name`` equals CONTACT_LINK_FIELD_NAME is
    used. Returns None when neither lookup finds a row.
    """
    schema_rows = conn.execute(
        "SELECT field_id, field_name, field_key FROM object_schemas "
        "WHERE location_id=? AND object_key='contact'",
        (location_id,),
    ).fetchall()

    # First pass: exact match on the field key.
    key_hit = next(
        (row for row in schema_rows if row["field_key"] == CONTACT_LINK_FIELD_KEY),
        None,
    )
    if key_hit is not None:
        return key_hit["field_id"]

    # Second pass: tolerant match on the display name.
    name_hit = next(
        (
            row
            for row in schema_rows
            if strip_accents(row["field_name"]).lower().strip() == CONTACT_LINK_FIELD_NAME
        ),
        None,
    )
    if name_hit is None:
        return None
    return name_hit["field_id"]


def extract_contact_link_value(custom_fields_json, field_id):
    """Return the 'ID Contacto Sucursal' value of a contact, if any.

    Args:
        custom_fields_json: JSON text expected to decode to a list of
            custom-field dicts.
        field_id: id to look for under the ``id`` or ``fieldId`` key.

    Returns:
        The first value among ``value``, ``fieldValue`` and
        ``fieldValueString`` that is neither None nor ``""`` in a matching
        entry, or None when the arguments are empty, the JSON is invalid or
        not a list, or the field is absent/empty.
    """
    if not field_id or not custom_fields_json:
        return None
    try:
        cfs = json.loads(custom_fields_json)
    except (TypeError, ValueError, RecursionError):
        # Undecodable payloads are treated as "no value" (best effort).
        return None
    if not isinstance(cfs, list):
        return None
    for cf in cfs:
        # Skip stray non-dict entries (null, strings, numbers) instead of
        # crashing on `.get`.
        if not isinstance(cf, dict):
            continue
        if field_id not in (cf.get("id"), cf.get("fieldId")):
            continue
        for key in ("value", "fieldValue", "fieldValueString"):
            v = cf.get(key)
            if v is not None and v != "":
                return v
    return None


# Normalization patterns for tolerant "Sucursal" matching. Each entry is
# (compiled pattern, replacement); they rewrite common abbreviations into a
# canonical form before values are compared. They are applied in list order to
# text that is already accent-free and lower-case.
SUCURSAL_ABBREV_PATTERNS = [
    # The optional "ico" lets "edo de mexico" / "edo mexico" match as a whole.
    # Without it only the bare "edo" rule below fires and the result becomes
    # "estado de mexico de mexico".
    (re.compile(r"\bedo\.?\s*de\s*mex(?:ico)?\.?\b"), "estado de mexico"),
    (re.compile(r"\bedo\.?\s*mex(?:ico)?\.?\b"), "estado de mexico"),
    (re.compile(r"\bedomex\b"), "estado de mexico"),
    (re.compile(r"\bedo\b"), "estado de mexico"),
    (re.compile(r"\bcdmx\b"), "ciudad de mexico"),
    (re.compile(r"\bd\.?\s*f\.?\b"), "ciudad de mexico"),
    # "Cd." / "Cd " as the prefix of a proper name (Cd. Satelite, Cd Juarez).
    # Listed after \bcdmx\b so that "cdmx" is not broken apart.
    (re.compile(r"\bcd\.?\s+"), "ciudad "),
    (re.compile(r"\bn\.?\s*l\.?\b"), "nuevo leon"),
    (re.compile(r"\bqro\.?\b"), "queretaro"),
    (re.compile(r"\bpue\.?\b"), "puebla"),
    (re.compile(r"\bgto\.?\b"), "guanajuato"),
    (re.compile(r"\bmich\.?\b"), "michoacan"),
    (re.compile(r"\bmor\.?\b"), "morelos"),
    (re.compile(r"\boax\.?\b"), "oaxaca"),
    (re.compile(r"\bgro\.?\b"), "guerrero"),
    (re.compile(r"\bhgo\.?\b"), "hidalgo"),
    (re.compile(r"\btams\.?\b"), "tamaulipas"),
    (re.compile(r"\btamps\.?\b"), "tamaulipas"),
    (re.compile(r"\bchis\.?\b"), "chiapas"),
    (re.compile(r"\bcamp\.?\b"), "campeche"),
    (re.compile(r"\bq\.?\s*roo\b"), "quintana roo"),
    (re.compile(r"\bcoah\.?\b"), "coahuila"),
]


def normalize_sucursal_value(value):
    """Normalize a Sucursal value for tolerant matching.

    Steps: remove accents, lower-case, turn periods and commas into spaces,
    collapse whitespace, apply every SUCURSAL_ABBREV_PATTERNS substitution in
    order, then collapse whitespace again. Falsy input yields ``""``.
    """
    if not value:
        return ""
    text = strip_accents(str(value)).lower()
    text = text.replace(".", " ").replace(",", " ")
    text = " ".join(text.split())
    for pattern, replacement in SUCURSAL_ABBREV_PATTERNS:
        text = pattern.sub(replacement, text)
    tokens = text.split()
    return " ".join(tokens)


def resolve_location_from_sucursal(sucursal_value, verifier_by_loc):
    """Map a Sucursal custom-field value to a location id.

    The value is compared, after normalization, with the ``sucursal`` entry of
    every non-Brand location in ``verifier_by_loc``.

    Returns:
        ``(location_id, match_kind)`` where ``match_kind`` is ``"exact"`` or
        ``"substring"`` (so a UI can show confidence), or ``(None, None)``
        when nothing matches or the substring candidates are ambiguous.
    """
    unresolved = (None, None)
    if not sucursal_value:
        return unresolved
    wanted = normalize_sucursal_value(sucursal_value)
    if not wanted:
        return unresolved

    exact_location = None
    partial = []  # (location_id, absolute length difference)
    for location_id, entry in verifier_by_loc.items():
        if location_id == BRAND_LOCATION_ID:
            continue
        listed = entry.get("sucursal")
        if not listed:
            continue
        listed_norm = normalize_sucursal_value(listed)
        if not listed_norm:
            continue
        if listed_norm == wanted:
            exact_location = location_id
            break
        # Two-way containment: either string may contain the other.
        if wanted in listed_norm or listed_norm in wanted:
            partial.append((location_id, abs(len(listed_norm) - len(wanted))))

    if exact_location:
        return exact_location, "exact"
    if not partial:
        return unresolved
    if len(partial) == 1:
        return partial[0][0], "substring"

    # Several candidates: keep the one closest in length, but only when it
    # beats the runner-up by more than one character; otherwise it is a tie.
    ranked = sorted(partial, key=lambda hit: hit[1])
    best, runner_up = ranked[0], ranked[1]
    if runner_up[1] - best[1] <= 1:
        return unresolved
    return best[0], "substring"


def load_accounts_filtered(conn):
    """Split the rows of ``accounts`` into Brand, branches and demo accounts.

    Returns:
        A tuple ``(brand, branches, demos)``: ``brand`` is the dict for the
        row whose location id is BRAND_LOCATION_ID (None if absent),
        ``branches`` lists every other non-demo row and ``demos`` lists the
        rows whose name is flagged by ``is_demo_account``. The demo check
        runs first, so a demo-named row never becomes ``brand``.
    """
    brand, branches, demos = None, [], []
    for row in conn.execute("SELECT location_id, nombre, type FROM accounts").fetchall():
        account = dict(row)
        if is_demo_account(account["nombre"]):
            demos.append(account)
        elif account["location_id"] == BRAND_LOCATION_ID:
            brand = account
        else:
            branches.append(account)
    return brand, branches, demos


def load_contacts(conn, location_id):
    """Return the contacts of one location as a list of plain dicts.

    Each dict carries id, first_name, last_name, phone, email,
    custom_fields_json and date_added. Converting rows with ``dict`` requires
    the connection to produce mapping-like rows (e.g. ``sqlite3.Row``).
    """
    query = (
        "SELECT id, first_name, last_name, phone, email, custom_fields_json, date_added "
        "FROM contacts WHERE location_id = ?"
    )
    cursor = conn.execute(query, (location_id,))
    return list(map(dict, cursor.fetchall()))


def load_opps(conn, location_id):
    """Return the opportunities of one location as a list of plain dicts.

    Each dict carries id, contact_id, status, name, pipeline_id,
    monetary_value and custom_fields_json. Converting rows with ``dict``
    requires the connection to produce mapping-like rows (e.g. ``sqlite3.Row``).
    """
    query = (
        "SELECT id, contact_id, status, name, pipeline_id, monetary_value, custom_fields_json "
        "FROM opportunities WHERE location_id = ?"
    )
    cursor = conn.execute(query, (location_id,))
    return list(map(dict, cursor.fetchall()))


# ---------------------------------------------------------------------------
# Matching
# ---------------------------------------------------------------------------

def _contact_full_name_norm(c):
    """Return the contact's full name accent-free, lower-case and single-spaced."""
    first = c.get("first_name") or ""
    last = c.get("last_name") or ""
    tokens = strip_accents(f"{first} {last}").lower().split()
    return " ".join(tokens)


def build_contact_index(contacts):
    """Build the lookup indexes used for strict contact matching.

    Returns ``(by_phone, by_email, by_name)``, each a ``defaultdict(list)``
    keyed by the normalized phone, normalized email and normalized full name.

    ``by_name`` holds EVERY target contact that has a name, with or without
    phone/email. The business rule is that a Brand contact lacking phone and
    email always comes from a branch, and the branch copy may well have
    phone/email (captured there originally via a form, etc.). The safeguard
    against homonym false positives lives on the source side, in find_match:
    a name lookup is only attempted when the source has NEITHER phone NOR
    email.
    """
    by_phone = defaultdict(list)
    by_email = defaultdict(list)
    by_name = defaultdict(list)
    for contact in contacts:
        keyed_indexes = (
            (by_phone, normalize_phone(contact.get("phone"))),
            (by_email, normalize_email(contact.get("email"))),
            (by_name, _contact_full_name_norm(contact)),
        )
        for index, key in keyed_indexes:
            # Empty keys are never indexed.
            if key:
                index[key].append(contact)
    return by_phone, by_email, by_name


def find_match(contact, by_phone, by_email, by_name=None,
               return_collisions=False, threshold=MATCH_THRESHOLD):
    """Find matches for ``contact`` in cascade: phone+name -> email -> name.

    Rules:
      1. If the source has a phone, candidates sharing it only count as a
         match when ``common.match_contacts`` reports level "strong" or
         "medium" with the given ``threshold``. Same phone with another level
         is a collision (e.g. a couple sharing one number): it is NOT a match
         and is collected in ``collisions`` for reporting.
      2. If the source has an email, candidates sharing it match without any
         name check, except those already flagged as phone collisions.
      3. If the source has NEITHER phone NOR email and ``by_name`` is given,
         candidates with the same normalized full name match.

    Args:
        contact: source contact dict.
        by_phone, by_email, by_name: indexes as built by build_contact_index.
        return_collisions: when True return ``(matches, collisions)``; when
            False (default, backward compatible) return only ``matches``.
        threshold: similarity threshold forwarded to ``match_contacts``.
    """
    phone_key = normalize_phone(contact.get("phone"))
    email_key = normalize_email(contact.get("email"))
    matches = []
    collisions = []
    accepted_ids = set()
    # Ids rejected at the phone step must not be "rescued" later by email:
    # the name divergence invalidates the pairing even if the email agrees
    # (it usually signals data mixed up by the integration, not one person).
    phone_rejected_ids = set()

    def _accept(candidate):
        matches.append(candidate)
        accepted_ids.add(candidate["id"])

    if phone_key and phone_key in by_phone:
        for candidate in by_phone[phone_key]:
            if candidate["id"] in accepted_ids:
                continue
            verdict = _match_contacts(contact, candidate, threshold=threshold)
            if verdict["level"] in ("strong", "medium"):
                _accept(candidate)
            else:
                collisions.append(candidate)
                phone_rejected_ids.add(candidate["id"])

    if email_key and email_key in by_email:
        for candidate in by_email[email_key]:
            candidate_id = candidate["id"]
            if candidate_id in accepted_ids or candidate_id in phone_rejected_ids:
                continue
            _accept(candidate)

    if by_name is not None and not (phone_key or email_key):
        name_key = _contact_full_name_norm(contact)
        if name_key and name_key in by_name:
            for candidate in by_name[name_key]:
                if candidate["id"] in accepted_ids:
                    continue
                _accept(candidate)

    return (matches, collisions) if return_collisions else matches


# ---------------------------------------------------------------------------
# Fuzzy matching (permissive modes) -- used to detect "possible matches" in
# other branches when strict matching does not find them.
# Output is purely informational: it does NOT move contacts to another bucket,
# it is only attached as a warning to each item so the operator can decide.
# ---------------------------------------------------------------------------

# Domains that email_canonical treats as Gmail (dots in the local part are
# dropped and the domain is unified to gmail.com).
_GMAIL_DOMAINS = {"gmail.com", "googlemail.com"}


def normalize_phone_partial(phone, last_n=7):
    """Return the last ``last_n`` digits of a phone (default 7).

    Unlike ``normalize_phone``, a number with fewer than ``last_n`` digits
    yields ``""`` instead of the digits it has.
    """
    only_digits = "".join(re.findall(r"\d", str(phone or "")))
    if len(only_digits) < last_n:
        return ""
    return only_digits[-last_n:]


def email_local_part(email):
    """Return the lower-cased local part of an email (the text before the first '@').

    Values without an '@' yield ``""``.
    """
    local, at_sign, _domain = normalize_email(email).partition("@")
    return local if at_sign else ""


def email_canonical(email):
    """Return a canonical form of the email for tolerant comparison.

    The address is trimmed and lower-cased, any '+alias' suffix of the local
    part is removed and, for Gmail domains, dots in the local part are dropped
    and the domain is unified to ``gmail.com``. This lets differently written
    versions of one address compare equal (juan.perez+spam@gmail.com and
    juanperez@gmail.com). Returns ``""`` when there is no '@' or the local
    part ends up empty.
    """
    local, at_sign, domain = normalize_email(email).partition("@")
    if not at_sign:
        return ""
    # Everything from the first '+' onwards is an alias tag.
    local = local.partition("+")[0]
    if domain in _GMAIL_DOMAINS:
        local = local.replace(".", "")
        domain = "gmail.com"  # googlemail.com and gmail.com are the same mailbox
    if not local:
        return ""
    return f"{local}@{domain}"


def first_last_norm(c):
    """Return '<first token of first_name> <last token of last_name>', normalized.

    Tolerates inconsistent name splits across CRMs:
      - 'Juan Pablo' as first_name + 'Franco' as last_name -> 'juan franco'
      - 'Juan' as first_name + 'Pablo Franco' as last_name -> 'juan franco'
      - 'Juan' as first_name + 'Franco Gutierrez' (both surnames) -> 'juan gutierrez'

    Using the LAST token of last_name avoids the classic false positive of
    taking its first token, which collides when that token is really a middle
    name ('juan pablo franco' vs 'juan pablo jimenez' would both give
    'juan pablo'). Returns ``""`` when either side has no tokens.
    """
    given = (c.get("first_name") or "").strip()
    family = (c.get("last_name") or "").strip()
    given_tokens = strip_accents(given).lower().split()
    family_tokens = strip_accents(family).lower().split()
    if not given_tokens or not family_tokens:
        return ""
    return f"{given_tokens[0]} {family_tokens[-1]}"


def build_fuzzy_indexes(contacts):
    """Build the additional indexes used by the fuzzy strategies.

    Returns a dict with the keys ``phone_partial``, ``email_local``,
    ``email_canonical`` and ``first_last``; each value is a
    ``defaultdict(list)`` from the strategy key to the contact dicts sharing
    it. Contacts whose key is empty for a strategy are left out of that index.
    """
    indexes = {
        "phone_partial": defaultdict(list),
        "email_local": defaultdict(list),
        "email_canonical": defaultdict(list),
        "first_last": defaultdict(list),
    }
    for contact in contacts:
        email = contact.get("email")
        keys = {
            "phone_partial": normalize_phone_partial(contact.get("phone")),
            "email_local": email_local_part(email),
            "email_canonical": email_canonical(email),
            "first_last": first_last_norm(contact),
        }
        for strategy, key in keys.items():
            if key:
                indexes[strategy][key].append(contact)
    return indexes


def find_fuzzy_matches(contact, fuzzy_indexes, exclude_ids=None, strict_match_phone=None, strict_match_email=None):
    """Return fuzzy ("possible") matches for ``contact`` from ``fuzzy_indexes``.

    Args:
        contact: source contact dict.
        fuzzy_indexes: dict produced by build_fuzzy_indexes.
        exclude_ids: optional iterable of candidate ids to omit (strict
            matches already counted, so the warning is not duplicated).
        strict_match_phone: normalized phone of the source; candidates whose
            normalized phone equals it are dropped from the partial-phone
            strategy because they are really strict phone matches.
        strict_match_email: normalized email of the source; same idea for the
            canonical-email strategy.

    Returns:
        A list of dicts with ``id``, ``location_id``, ``location_name``,
        ``first_name``, ``last_name``, ``phone``, ``email``, ``strategy``,
        ``strategy_label`` and ``score``, sorted by descending score and then
        by location name. ``location_id`` / ``location_name`` are read from
        the candidate's ``_loc`` / ``_loc_name`` keys. When a candidate
        matches through several strategies only the highest-scoring one is
        kept.
    """
    exclude_ids = set(exclude_ids or [])
    # strategy -> (operator-facing label, confidence score). The accented
    # labels were previously stored as mojibake; they are proper UTF-8 now.
    # NOTE(review): the "first_last" label says "primer apellido", while
    # first_last_norm keys on the LAST token of last_name -- confirm wording.
    STRATEGY_LABELS = {
        "phone_partial": ("Teléfono parcial (últ. 7 dígitos)", 90),
        "email_canonical": ("Email canónico (gmail sin puntos/alias)", 95),
        "email_local": ("Misma parte local del email (otro dominio)", 70),
        "first_last": ("Mismo nombre + primer apellido", 65),
    }
    candidates = {}  # id -> {"m": contact dict, "score", "strategy", "strategy_label"}

    def _add(strategy, ms):
        # Register candidates, keeping the strongest strategy for each id.
        label, score = STRATEGY_LABELS[strategy]
        for m in ms:
            mid = m.get("id")
            if not mid or mid in exclude_ids:
                continue
            prev = candidates.get(mid)
            if prev is None or score > prev["score"]:
                candidates[mid] = {"m": m, "score": score, "strategy": strategy, "strategy_label": label}

    # Partial phone: only when the source phone has at least 7 digits.
    src_pp = normalize_phone_partial(contact.get("phone"))
    if src_pp:
        ms = fuzzy_indexes["phone_partial"].get(src_pp, [])
        # Skip candidates that are already strict phone matches (same last 10).
        if strict_match_phone:
            ms = [m for m in ms if normalize_phone(m.get("phone")) != strict_match_phone]
        _add("phone_partial", ms)

    # Canonical email: only when the source email has a domain.
    src_ec = email_canonical(contact.get("email"))
    src_email_norm = normalize_email(contact.get("email"))
    if src_ec:
        ms = fuzzy_indexes["email_canonical"].get(src_ec, [])
        if strict_match_email:
            ms = [m for m in ms if normalize_email(m.get("email")) != strict_match_email]
        _add("email_canonical", ms)

    # Same local part under a different domain.
    src_el = email_local_part(contact.get("email"))
    if src_el:
        ms = fuzzy_indexes["email_local"].get(src_el, [])
        # Identical emails are excluded here (covered by email_canonical/strict).
        ms = [m for m in ms if normalize_email(m.get("email")) != src_email_norm]
        _add("email_local", ms)

    # Same first-name token + last-name token (see first_last_norm).
    src_fl = first_last_norm(contact)
    if src_fl:
        _add("first_last", fuzzy_indexes["first_last"].get(src_fl, []))

    out = []
    for cid, info in candidates.items():
        m = info["m"]
        out.append({
            "id": cid,
            "location_id": m.get("_loc"),
            "location_name": m.get("_loc_name"),
            "first_name": m.get("first_name"),
            "last_name": m.get("last_name"),
            "phone": m.get("phone"),
            "email": m.get("email"),
            "strategy": info["strategy"],
            "strategy_label": info["strategy_label"],
            "score": info["score"],
        })
    out.sort(key=lambda x: (-x["score"], x.get("location_name") or ""))
    return out


# ---------------------------------------------------------------------------
# Auditoria principal
# ---------------------------------------------------------------------------

def run_audit(limit_missing=None):
    """Ejecuta la comparativa y devuelve un dict JSON-serializable.

    Args:
        limit_missing: si es int, recorta cada listado de ausentes a N items
            (para respuestas API mas ligeras). None = sin recorte.
    """
    if not os.path.exists(DB_PATH):
        raise FileNotFoundError(
            f"No existe {DB_PATH}. Corre una sincronizacion global primero."
        )

    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row

    try:
        brand, branches, demos = load_accounts_filtered(conn)
        if not brand:
            raise RuntimeError(
                "No se encontro la cuenta de Marca en la tabla accounts. "
                "Corre la sincronizacion para poblar el catalogo."
            )

        verifier_by_loc, verifier_by_tienda = load_verifier()
        brand_tienda_field_id = resolve_tienda_field_id(conn, BRAND_LOCATION_ID)
        brand_sucursal_field_id = resolve_sucursal_field_id(conn, BRAND_LOCATION_ID)
        brand_opp_link_field_id = resolve_opp_link_field_id(conn, BRAND_LOCATION_ID)
        brand_contact_link_field_id = resolve_contact_link_field_id(conn, BRAND_LOCATION_ID)

        brand_contacts = load_contacts(conn, BRAND_LOCATION_ID)
        brand_opps = load_opps(conn, BRAND_LOCATION_ID)

        branch_data = {}
        total_branch_contacts = 0
        total_branch_opps = 0
        per_branch_summary = []

        for b in branches:
            loc = b["location_id"]
            bc = load_contacts(conn, loc)
            bo = load_opps(conn, loc)
            branch_data[loc] = {"contacts": bc, "opps": bo, "name": b["nombre"]}
            total_branch_contacts += len(bc)
            total_branch_opps += len(bo)

        # ---- Indices globales de sucursal (para buscar contraparte de Marca) ----
        all_branch_contacts = []
        for loc, data in branch_data.items():
            for c in data["contacts"]:
                aug = dict(c)
                aug["_loc"] = loc
                aug["_loc_name"] = data["name"]
                all_branch_contacts.append(aug)

        branch_idx_phone, branch_idx_email, branch_idx_name = build_contact_index(all_branch_contacts)

        # Indices fuzzy globales (para detectar "posibles coincidencias" en
        # otras sucursales cuando el match estricto no las encuentra). Solo
        # se consultan para los items del bucket missing_in_assigned_branch.
        branch_fuzzy_indexes = build_fuzzy_indexes(all_branch_contacts)

        # Indice por sucursal individual (para verificar si el contacto Marca esta
        # en la sucursal especifica que le toca segun el verificador).
        per_branch_idx = {}
        for loc, data in branch_data.items():
            per_branch_idx[loc] = build_contact_index(data["contacts"])

        # Opps por contact_id en cada sucursal y en Marca.
        brand_opps_by_cid = defaultdict(list)
        for o in brand_opps:
            brand_opps_by_cid[o["contact_id"]].append(o)

        # Índice de opps de Marca por el valor del campo "ID Oportunidad Sucursal"
        # (= id nativo de la opp de sucursal de origen). Es la llave determinística
        # para el match por campo (criterio principal del bucket de opps faltantes).
        brand_opps_by_link = {}
        for o in brand_opps:
            lv = extract_opp_link_value(o.get("custom_fields_json"), brand_opp_link_field_id)
            if lv:
                brand_opps_by_link.setdefault(lv, o)

        # Índice de contactos de Marca por el valor del campo "ID Contacto Sucursal"
        # (= id nativo del contacto de sucursal de origen). Llave determinística
        # para match por campo en el bucket contacts_in_branch_not_in_brand.
        brand_contacts_by_link = {}
        for c in brand_contacts:
            lv = extract_contact_link_value(c.get("custom_fields_json"), brand_contact_link_field_id)
            if lv:
                brand_contacts_by_link.setdefault(lv, c)

        per_branch_opps_by_cid = {}
        for loc, data in branch_data.items():
            grouped = defaultdict(list)
            for o in data["opps"]:
                grouped[o.get("contact_id")].append(o)
            per_branch_opps_by_cid[loc] = grouped

        # ----------------------------------------------------------------------
        # 1) Contactos en sucursal sin contraparte en Marca
        # ----------------------------------------------------------------------
        brand_idx_phone, brand_idx_email, brand_idx_name = build_contact_index(brand_contacts)

        missing_in_brand = []  # contactos sucursal que no estan en Marca
        for loc, data in branch_data.items():
            for c in data["contacts"]:
                # Criterio PRINCIPAL: match por el campo "ID Contacto Sucursal".
                # Si existe un contacto Marca cuyo valor de ese campo == id nativo
                # de este contacto de sucursal, está replicado. Determinístico.
                if c.get("id") in brand_contacts_by_link:
                    continue
                # Respaldo: lógica histórica por phone/email/name.
                if find_match(c, brand_idx_phone, brand_idx_email, brand_idx_name):
                    continue
                opps_here = per_branch_opps_by_cid[loc].get(c["id"], [])
                missing_in_brand.append({
                    **fmt_contact(c),
                    "branch_location_id": loc,
                    "branch_name": data["name"],
                    "opps_in_branch": len(opps_here),
                })

        # ----------------------------------------------------------------------
        # 2) Contactos en Marca que no estan en la sucursal que les corresponde
        #    por el verificador (TIENDA del contacto -> location_id).
        # ----------------------------------------------------------------------
        missing_in_assigned_branch = []  # NO esta en ninguna sucursal (incluye la asignada). Tiene TIENDA.
        present_in_other_branch_not_assigned = []  # Esta en OTRA sucursal, no la asignada.
        probable_duplicate_in_brand = []  # Marca tiene phone/email pero hay homÃ³nimo con phone/email en la sucursal asignada -> probable duplicado en Marca.
        brand_without_tienda = []         # contactos Marca sin TIENDA poblada
        brand_with_unknown_tienda = []    # contactos Marca con TIENDA que no matchea ninguna fila del verificador
        brand_present_in_any_branch = 0
        brand_not_in_any_branch = []      # contacto Marca que no aparece en ninguna sucursal (no asignable)

        # Pre-indexar nombres por sucursal para encontrar el candidato exacto en la
        # sucursal asignada cuando el contacto Marca no matchea por phone/email.
        per_branch_name_idx = {}
        per_branch_by_id = {}
        for loc, data in branch_data.items():
            name_idx = {}
            by_id = {}
            for bc in data["contacts"]:
                by_id[bc["id"]] = bc
                full = f"{bc.get('first_name') or ''} {bc.get('last_name') or ''}"
                nm = " ".join(strip_accents(full).lower().split())
                if nm:
                    name_idx.setdefault(nm, []).append(bc)
            per_branch_name_idx[loc] = name_idx
            per_branch_by_id[loc] = by_id

        def _enrich_other_branches(global_matches_list):
            """Devuelve other_branches con id + tienda del verificador."""
            out = []
            for m in global_matches_list[:5]:
                loc_id = m["_loc"]
                vinfo = verifier_by_loc.get(loc_id) or {}
                out.append({
                    "location_id": loc_id,
                    "name": m.get("_loc_name") or branch_data.get(loc_id, {}).get("name"),
                    "id": m["id"],
                    "tienda_value": vinfo.get("tienda_raw"),
                })
            return out

        for c in brand_contacts:
            tienda_value = extract_tienda_from_custom_fields(
                c.get("custom_fields_json"), brand_tienda_field_id
            )
            sucursal_value = extract_tienda_from_custom_fields(
                c.get("custom_fields_json"), brand_sucursal_field_id
            )
            tienda_norm = normalize_tienda(tienda_value) if tienda_value else None
            target_loc = verifier_by_tienda.get(tienda_norm) if tienda_norm else None
            # Resolution source: "tienda" | "sucursal_exact" | "sucursal_substring" | None
            target_loc_source = "tienda" if target_loc else None
            target_loc_match_kind = None
            # Segundo check: si TIENDA no resolvio, intentar mapear via Sucursal
            # con matching tolerante (abreviaturas + substring).
            if not target_loc and sucursal_value:
                fb_loc, fb_kind = resolve_location_from_sucursal(sucursal_value, verifier_by_loc)
                if fb_loc:
                    target_loc = fb_loc
                    target_loc_source = f"sucursal_{fb_kind}"
                    target_loc_match_kind = fb_kind

            # Match global contra todas las sucursales (informativo)
            global_matches = find_match(c, branch_idx_phone, branch_idx_email, branch_idx_name)
            if global_matches:
                brand_present_in_any_branch += 1
            else:
                brand_not_in_any_branch.append({
                    **fmt_contact(c),
                    "tienda": tienda_value,
                    "sucursal": sucursal_value,
                    "expected_location_id": target_loc,
                    "expected_branch_name": branch_data.get(target_loc, {}).get("name") if target_loc else None,
                    "resolution_source": target_loc_source,
                    "opps_in_brand": len(brand_opps_by_cid.get(c["id"], [])),
                })

            if not tienda_value:
                # Enriquecer con datos del CF Sucursal para que el dashboard pueda
                # ofrecer "Llenar TIENDA desde Sucursal". Solo es resoluble si
                # Sucursal mapea a una sucursal del verificador con TIENDA poblada.
                sucursal_resolved_loc = None
                sucursal_resolved_kind = None
                expected_tienda = None
                expected_branch_name = None
                if sucursal_value:
                    fb_loc, fb_kind = resolve_location_from_sucursal(
                        sucursal_value, verifier_by_loc
                    )
                    if fb_loc:
                        sucursal_resolved_loc = fb_loc
                        sucursal_resolved_kind = fb_kind
                        vinfo = verifier_by_loc.get(fb_loc) or {}
                        expected_tienda = vinfo.get("tienda_raw")
                        expected_branch_name = branch_data.get(fb_loc, {}).get("name")
                brand_without_tienda.append({
                    **fmt_contact(c),
                    "sucursal": sucursal_value or "",
                    "sucursal_resolved_location_id": sucursal_resolved_loc,
                    "sucursal_resolution_kind": sucursal_resolved_kind,
                    "expected_tienda": expected_tienda,
                    "expected_branch_name": expected_branch_name,
                    "looks_like_test": looks_like_test_contact(c),
                })
                continue

            if not target_loc:
                brand_with_unknown_tienda.append({
                    **fmt_contact(c),
                    "tienda": tienda_value,
                })
                continue

            # Buscar en la sucursal asignada
            idx = per_branch_idx.get(target_loc)
            if not idx:
                # La sucursal asignada esta filtrada por demo o no esta cacheada.
                continue
            branch_phone_idx, branch_email_idx, branch_name_idx_loc = idx
            in_assigned = find_match(c, branch_phone_idx, branch_email_idx, branch_name_idx_loc)
            if not in_assigned:
                # Si la sucursal asignada es una "shell" absorbida por un hub
                # digital (Toluca/Metepec/Lerma -> Pilares), aceptar la presencia
                # en el hub como correcta. Evita ~82 falsos positivos.
                hub_loc = DIGITAL_HUB_BY_SHELL.get(target_loc)
                hub_idx = per_branch_idx.get(hub_loc) if hub_loc else None
                if hub_idx and find_match(c, hub_idx[0], hub_idx[1], hub_idx[2]):
                    in_assigned = True
            if in_assigned:
                continue  # Esta donde corresponde, sin discrepancia.

            opps_in_brand = len(brand_opps_by_cid.get(c["id"], []))
            other_branches_enriched = _enrich_other_branches(global_matches)

            if global_matches:
                # Caso B: esta en OTRA sucursal (no la asignada).
                present_in_other_branch_not_assigned.append({
                    **fmt_contact(c),
                    "tienda": tienda_value,
                    "expected_location_id": target_loc,
                    "expected_branch_name": branch_data[target_loc]["name"],
                    "opps_in_brand": opps_in_brand,
                    "other_branches": other_branches_enriched,
                })
            else:
                # Caso A o Caso D: el contacto Marca no matcheó con NINGUNA
                # sucursal por phone/email. Buscar homónimos exactos por nombre
                # en la sucursal asignada para decidir el sub-caso.
                brand_full = f"{c.get('first_name') or ''} {c.get('last_name') or ''}"
                brand_name_norm = " ".join(strip_accents(brand_full).lower().split())
                candidates_by_name = []
                if brand_name_norm:
                    candidates_by_name = per_branch_name_idx.get(target_loc, {}).get(brand_name_norm, []) or []

                # Caso D: probable duplicado en Marca. El contacto Marca SÍ tiene
                # phone/email, no matcheó en ninguna sucursal, pero hay un homónimo
                # con identificadores fuertes en la sucursal asignada. Lo más
                # probable: en sucursal está el contacto bueno y en Marca quedó
                # un registro extra con otro número/email.
                brand_phone_norm = normalize_phone(c.get("phone"))
                brand_email_norm = normalize_email(c.get("email"))
                brand_has_strong_id = bool(brand_phone_norm or brand_email_norm)
                candidate_with_strong_id = None
                for cand in candidates_by_name:
                    if normalize_phone(cand.get("phone")) or normalize_email(cand.get("email")):
                        candidate_with_strong_id = cand
                        break

                if brand_has_strong_id and candidate_with_strong_id:
                    probable_duplicate_in_brand.append({
                        **fmt_contact(c),
                        "tienda": tienda_value,
                        "expected_location_id": target_loc,
                        "expected_branch_name": branch_data[target_loc]["name"],
                        "opps_in_brand": opps_in_brand,
                        "branch_existing_contact": {
                            "id": candidate_with_strong_id["id"],
                            "phone": candidate_with_strong_id.get("phone"),
                            "email": candidate_with_strong_id.get("email"),
                            "first_name": candidate_with_strong_id.get("first_name"),
                            "last_name": candidate_with_strong_id.get("last_name"),
                        },
                        "homonyms_in_branch_count": len(candidates_by_name),
                    })
                else:
                    # Caso A: contacto genuinamente ausente. Mantener candidato
                    # por nombre para el botón de update-branch-from-brand
                    # cuando aplique (caso típico: ambos sin phone/email).
                    target_candidate = None
                    if len(candidates_by_name) == 1:
                        cand = candidates_by_name[0]
                        target_candidate = {
                            "id": cand["id"],
                            "phone": cand.get("phone"),
                            "email": cand.get("email"),
                            "first_name": cand.get("first_name"),
                            "last_name": cand.get("last_name"),
                        }
                    # Fuzzy: explora otras sucursales con modalidades permisivas
                    # (teléfono parcial, email canónico/local, primer-apellido).
                    # Es puramente informativo — el contacto sigue en este bucket.
                    fuzzy = find_fuzzy_matches(
                        c,
                        branch_fuzzy_indexes,
                        exclude_ids=None,  # aquÃ global_matches estÃ¡ vacÃo por definiciÃ³n
                        strict_match_phone=normalize_phone(c.get("phone")),
                        strict_match_email=normalize_email(c.get("email")),
                    )
                    # Enriquece cada match con la TIENDA del verificador para
                    # dar contexto al operador (a qué sucursal apunta cada uno).
                    for fm in fuzzy:
                        loc_id_fm = fm.get("location_id")
                        if loc_id_fm:
                            vinfo = verifier_by_loc.get(loc_id_fm) or {}
                            fm["location_tienda"] = vinfo.get("tienda_raw")
                    missing_in_assigned_branch.append({
                        **fmt_contact(c),
                        "tienda": tienda_value,
                        "expected_location_id": target_loc,
                        "expected_branch_name": branch_data[target_loc]["name"],
                        "opps_in_brand": opps_in_brand,
                        "branch_target_candidate": target_candidate,
                        "branch_target_candidates_count": len(candidates_by_name),
                        "fuzzy_matches": fuzzy,
                    })

        # ----------------------------------------------------------------------
        # 3) Oportunidades sin contraparte
        # ----------------------------------------------------------------------
        # Para cada opp de sucursal, ver si su contacto tiene match en Marca y si
        # ese contact Marca tiene al menos 1 opp. Si no, es opp ausente en Marca.
        missing_opps_in_brand = []
        for loc, data in branch_data.items():
            bc_by_id = {c["id"]: c for c in data["contacts"]}
            for o in data["opps"]:
                # Criterio PRINCIPAL: match por el campo "ID Oportunidad Sucursal".
                # Si existe una opp de Marca cuyo valor de ese campo == el id nativo
                # de esta opp de sucursal, está replicada. Compara cada opp de forma
                # individual -> detecta el gap multi-empeño. Si no hay match por
                # campo, cae al respaldo por contacto (lógica histórica de abajo).
                if o.get("id") in brand_opps_by_link:
                    continue
                contact = bc_by_id.get(o.get("contact_id"))
                if not contact:
                    # opp huerfana sin contacto cacheado en sucursal -> reportar pero como anomaly
                    missing_opps_in_brand.append({
                        "id": o["id"],
                        "name": o.get("name") or "",
                        "status": o.get("status") or "",
                        "monetary_value": o.get("monetary_value") or 0,
                        "branch_location_id": loc,
                        "branch_name": data["name"],
                        "contact_id": o.get("contact_id") or "",
                        "contact_name": "(contacto no cacheado)",
                        "contact_phone": "",
                        "contact_email": "",
                        "reason": "contacto_huerfano",
                    })
                    continue
                marca_matches = find_match(contact, brand_idx_phone, brand_idx_email, brand_idx_name)
                if not marca_matches:
                    missing_opps_in_brand.append({
                        "id": o["id"],
                        "name": o.get("name") or "",
                        "status": o.get("status") or "",
                        "monetary_value": o.get("monetary_value") or 0,
                        "branch_location_id": loc,
                        "branch_name": data["name"],
                        "contact_id": contact["id"],
                        "contact_name": fmt_contact(contact)["name"],
                        "contact_phone": contact.get("phone") or "",
                        "contact_email": contact.get("email") or "",
                        "reason": "contacto_no_en_marca",
                    })
                    continue
                # contacto SI esta en Marca, ver si tiene opps replicadas
                has_brand_opp = any(brand_opps_by_cid.get(m["id"]) for m in marca_matches)
                if not has_brand_opp:
                    missing_opps_in_brand.append({
                        "id": o["id"],
                        "name": o.get("name") or "",
                        "status": o.get("status") or "",
                        "monetary_value": o.get("monetary_value") or 0,
                        "branch_location_id": loc,
                        "branch_name": data["name"],
                        "contact_id": contact["id"],
                        "contact_name": fmt_contact(contact)["name"],
                        "contact_phone": contact.get("phone") or "",
                        "contact_email": contact.get("email") or "",
                        "reason": "opp_no_replicada",
                    })

        # ----------------------------------------------------------------------
        # 3b) Oportunidades con el campo "ID Oportunidad Sucursal" vacío o inválido
        # ----------------------------------------------------------------------
        # El valor debe ser el id nativo de la opp (20 chars alfanuméricos). Vacío
        # o len != 20 => inválido. Sucursales son accionables (botón de llenado =
        # su propio id); Marca es informativo (su campo se resuelve por matcheo/sync).
        opps_missing_id_field = []

        def _classify_link(value):
            if not value:
                return "vacio"
            if not OPP_ID_PATTERN.match(str(value)):
                return "longitud_invalida"
            return None

        for o in brand_opps:
            v = extract_opp_link_value(o.get("custom_fields_json"), brand_opp_link_field_id)
            reason = _classify_link(v)
            if reason is None:
                continue
            opps_missing_id_field.append({
                "id": o["id"],
                "name": o.get("name") or "",
                "status": o.get("status") or "",
                "location_id": BRAND_LOCATION_ID,
                "location_name": brand["nombre"],
                "is_brand": True,
                "field_value": v or "",
                "field_len": len(str(v)) if v else 0,
                "reason": reason,
            })

        for loc, data in branch_data.items():
            branch_link_fid = resolve_opp_link_field_id(conn, loc)
            for o in data["opps"]:
                v = extract_opp_link_value(o.get("custom_fields_json"), branch_link_fid)
                reason = _classify_link(v)
                if reason is None:
                    continue
                opps_missing_id_field.append({
                    "id": o["id"],
                    "name": o.get("name") or "",
                    "status": o.get("status") or "",
                    "location_id": loc,
                    "location_name": data["name"],
                    "is_brand": False,
                    "field_value": v or "",
                    "field_len": len(str(v)) if v else 0,
                    "reason": reason,
                })

        # ----------------------------------------------------------------------
        # 3b-bis) Réplicas DUPLICADAS en Marca (mismo "ID Oportunidad Sucursal")
        # ----------------------------------------------------------------------
        # Descuadre POSITIVO (Marca > sucursales): si dos o más opps de Marca
        # comparten el MISMO valor de "ID Oportunidad Sucursal" (= apuntan a la
        # misma opp de sucursal de origen) son réplicas duplicadas. Causa típica:
        # el workflow n8n de sync de opps hace CREATE en vez de UPDATE (no encontró
        # la opp existente al replicar). Es INVISIBLE para el bucket de huérfanas
        # (que trata el link como salvaguarda y nunca verifica unicidad).
        #
        # Por cada cluster se recomienda conservar la canónica y borrar las
        # sobrantes según la jerarquía de resolución de duplicados:
        #   (1) monetary_value mayor, (2) status activo (won/open) > lost/abandoned,
        #   (3) más antiguo > reciente [requiere createdAt en vivo -> el limpiador
        #   lo resuelve], (4) TIENDA. Cuando valor y status empatan se marca
        #   tie_break_needs_live_createdat=True para que el limpiador desempate.
        opps_in_brand_duplicate_link = []

        # Índice de opps de sucursal por id nativo -> (location_id, branch_name)
        # para nombrar el origen del link de cada cluster.
        branch_opp_owner_by_id = {}
        for loc, data in branch_data.items():
            for o in data["opps"]:
                branch_opp_owner_by_id[o["id"]] = (loc, data["name"])

        brand_contact_name_by_id = {c["id"]: fmt_contact(c)["name"] for c in brand_contacts}

        # Agrupa TODAS las opps de Marca por su valor de link válido (20 chars).
        brand_opps_link_groups = defaultdict(list)
        for o in brand_opps:
            v = extract_opp_link_value(o.get("custom_fields_json"), brand_opp_link_field_id)
            if v and OPP_ID_PATTERN.match(str(v)):
                brand_opps_link_groups[v].append(o)

        _STATUS_RANK = {"won": 3, "open": 2, "lost": 1, "abandoned": 0}
        duplicate_link_group_count = 0
        duplicate_link_extra = 0  # opps sobrantes = sum(group_size - 1)
        for link_value, group in brand_opps_link_groups.items():
            if len(group) < 2:
                continue
            duplicate_link_group_count += 1
            duplicate_link_extra += len(group) - 1
            owner_loc, owner_name = branch_opp_owner_by_id.get(link_value, (None, None))

            def _rank(o):
                return (
                    float(o.get("monetary_value") or 0),
                    _STATUS_RANK.get((o.get("status") or "").lower(), 0),
                )
            ordered = sorted(group, key=_rank, reverse=True)
            top, second = ordered[0], ordered[1]
            tie = _rank(top) == _rank(second)
            for idx, o in enumerate(ordered):
                opps_in_brand_duplicate_link.append({
                    "id": o["id"],
                    "name": o.get("name") or "",
                    "status": o.get("status") or "",
                    "monetary_value": o.get("monetary_value") or 0,
                    "contact_id": o.get("contact_id") or "",
                    "contact_name": brand_contact_name_by_id.get(o.get("contact_id"), ""),
                    "link_value": link_value,
                    "branch_opp_id": link_value,
                    "branch_location_id": owner_loc or "",
                    "branch_name": owner_name or "(sucursal no cacheada)",
                    "group_size": len(group),
                    "recommended_action": "keep" if idx == 0 else "delete",
                    "tie_break_needs_live_createdat": tie,
                })

        # ----------------------------------------------------------------------
        # 3c) Contactos con el campo "ID Contacto Sucursal" vacío o inválido
        # ----------------------------------------------------------------------
        # Paralelo al bucket de opps. Sucursales son accionables (botón llenado =
        # contact.id propio); Marca es informativo (su campo se resuelve por
        # matcheo/sync workflow, no se llena manualmente).
        contacts_missing_id_field = []

        for c in brand_contacts:
            v = extract_contact_link_value(c.get("custom_fields_json"), brand_contact_link_field_id)
            reason = _classify_link(v)
            if reason is None:
                continue
            contacts_missing_id_field.append({
                "id": c["id"],
                "first_name": c.get("first_name") or "",
                "last_name": c.get("last_name") or "",
                "phone": c.get("phone") or "",
                "email": c.get("email") or "",
                "location_id": BRAND_LOCATION_ID,
                "location_name": brand["nombre"],
                "is_brand": True,
                "field_value": v or "",
                "field_len": len(str(v)) if v else 0,
                "reason": reason,
            })

        for loc, data in branch_data.items():
            branch_contact_link_fid = resolve_contact_link_field_id(conn, loc)
            for c in data["contacts"]:
                v = extract_contact_link_value(c.get("custom_fields_json"), branch_contact_link_fid)
                reason = _classify_link(v)
                if reason is None:
                    continue
                contacts_missing_id_field.append({
                    "id": c["id"],
                    "first_name": c.get("first_name") or "",
                    "last_name": c.get("last_name") or "",
                    "phone": c.get("phone") or "",
                    "email": c.get("email") or "",
                    "location_id": loc,
                    "location_name": data["name"],
                    "is_brand": False,
                    "field_value": v or "",
                    "field_len": len(str(v)) if v else 0,
                    "reason": reason,
                })

        # ----------------------------------------------------------------------
        # 4) Desglose por sucursal
        # ----------------------------------------------------------------------
        for loc, data in branch_data.items():
            per_branch_summary.append({
                "location_id": loc,
                "name": data["name"],
                "contacts": len(data["contacts"]),
                "opportunities": len(data["opps"]),
            })
        per_branch_summary.sort(key=lambda x: x["name"])

        # ----------------------------------------------------------------------
        # 5) Duplicados intra-Marca: mismo nombre normalizado, sin phone NI email
        # ----------------------------------------------------------------------
        intra_brand_duplicates = []
        name_groups = defaultdict(list)
        for c in brand_contacts:
            if normalize_phone(c.get("phone")) or normalize_email(c.get("email")):
                continue
            full_name = f"{c.get('first_name') or ''} {c.get('last_name') or ''}"
            n = " ".join(strip_accents(full_name).lower().split())
            if not n:
                continue
            name_groups[n].append(c)

        # Pre-indexar contactos sucursal por name_norm (solo los que no tienen
        # phone ni email), para encontrar candidatos de sync para "unico restante".
        branch_no_pe_by_name = defaultdict(list)
        for loc, data in branch_data.items():
            from collections import Counter as _Counter
            opps_by_cid = _Counter(o.get("contact_id") for o in data["opps"] if o.get("contact_id"))
            for c in data["contacts"]:
                if normalize_phone(c.get("phone")) or normalize_email(c.get("email")):
                    continue
                full = f"{c.get('first_name') or ''} {c.get('last_name') or ''}"
                nm = " ".join(strip_accents(full).lower().split())
                if not nm:
                    continue
                branch_no_pe_by_name[nm].append({
                    "id": c["id"],
                    "location_id": loc,
                    "branch_name": data["name"],
                    "opps_count": opps_by_cid.get(c["id"], 0),
                })

        group_count = 0
        for name_norm, ccs in name_groups.items():
            if len(ccs) < 2:
                continue
            group_count += 1
            candidates = branch_no_pe_by_name.get(name_norm, [])
            # Ordenar por fecha_added desc, ponemos primero los recientes
            sorted_ccs = sorted(ccs, key=lambda x: (x.get("date_added") or ""), reverse=True)
            for c in sorted_ccs:
                intra_brand_duplicates.append({
                    **fmt_contact(c),
                    "name_norm": name_norm,
                    "group_size": len(ccs),
                    "opps_in_brand": len(brand_opps_by_cid.get(c["id"], [])),
                    "date_added": c.get("date_added") or "",
                    "branch_candidates": candidates,
                })

        # ----------------------------------------------------------------------
        # Resumen
        # ----------------------------------------------------------------------
        contact_diff = len(brand_contacts) - total_branch_contacts
        opp_diff = len(brand_opps) - total_branch_opps

        def maybe_limit(lst):
            if limit_missing is None or len(lst) <= limit_missing:
                return lst, len(lst), False
            return lst[:limit_missing], len(lst), True

        missing_in_brand_lim, mib_total, mib_trunc = maybe_limit(missing_in_brand)
        missing_in_assigned_lim, mia_total, mia_trunc = maybe_limit(missing_in_assigned_branch)
        present_other_lim, pother_total, pother_trunc = maybe_limit(present_in_other_branch_not_assigned)
        probable_dup_lim, probable_dup_total, probable_dup_trunc = maybe_limit(probable_duplicate_in_brand)
        brand_without_tienda_lim, bwt_total, bwt_trunc = maybe_limit(brand_without_tienda)
        brand_unknown_tienda_lim, but_total, but_trunc = maybe_limit(brand_with_unknown_tienda)
        brand_not_any_lim, bna_total, bna_trunc = maybe_limit(brand_not_in_any_branch)
        missing_opps_lim, mo_total, mo_trunc = maybe_limit(missing_opps_in_brand)
        opps_missing_id_lim, omif_total, omif_trunc = maybe_limit(opps_missing_id_field)
        dup_link_lim, dup_link_total, dup_link_trunc = maybe_limit(opps_in_brand_duplicate_link)
        contacts_missing_id_lim, cmif_total, cmif_trunc = maybe_limit(contacts_missing_id_field)
        dup_lim, dup_total, dup_trunc = maybe_limit(intra_brand_duplicates)

        return {
            "totals": {
                "brand": {
                    "name": brand["nombre"],
                    "location_id": BRAND_LOCATION_ID,
                    "contacts": len(brand_contacts),
                    "opportunities": len(brand_opps),
                },
                "branches_aggregate": {
                    "branch_count": len(branches),
                    "contacts": total_branch_contacts,
                    "opportunities": total_branch_opps,
                },
                "diff": {
                    "contacts": contact_diff,
                    "opportunities": opp_diff,
                    "contacts_match": contact_diff == 0,
                    "opportunities_match": opp_diff == 0,
                },
            },
            "demos_excluded": [
                {"location_id": d["location_id"], "name": d["nombre"]} for d in demos
            ],
            "per_branch": per_branch_summary,
            "missing": {
                "contacts_in_branch_not_in_brand": {
                    "total": mib_total,
                    "items": missing_in_brand_lim,
                    "truncated": mib_trunc,
                },
                "contacts_in_brand_not_in_assigned_branch": {
                    "total": mia_total,
                    "items": missing_in_assigned_lim,
                    "truncated": mia_trunc,
                },
                "contacts_in_brand_present_in_other_branch_not_assigned": {
                    "total": pother_total,
                    "items": present_other_lim,
                    "truncated": pother_trunc,
                },
                "contacts_in_brand_probable_duplicate": {
                    "total": probable_dup_total,
                    "items": probable_dup_lim,
                    "truncated": probable_dup_trunc,
                },
                "contacts_in_brand_without_tienda": {
                    "total": bwt_total,
                    "items": brand_without_tienda_lim,
                    "truncated": bwt_trunc,
                },
                "contacts_in_brand_with_unknown_tienda": {
                    "total": but_total,
                    "items": brand_unknown_tienda_lim,
                    "truncated": but_trunc,
                },
                "contacts_in_brand_not_in_any_branch": {
                    "total": bna_total,
                    "items": brand_not_any_lim,
                    "truncated": bna_trunc,
                },
                "opportunities_in_branch_not_in_brand": {
                    "total": mo_total,
                    "items": missing_opps_lim,
                    "truncated": mo_trunc,
                },
                "opportunities_missing_id_field": {
                    "total": omif_total,
                    "items": opps_missing_id_lim,
                    "truncated": omif_trunc,
                },
                "opportunities_in_brand_duplicate_link": {
                    "total": dup_link_total,
                    "items": dup_link_lim,
                    "truncated": dup_link_trunc,
                    "group_count": duplicate_link_group_count,
                    "extra_opps": duplicate_link_extra,
                },
                "contacts_missing_id_field": {
                    "total": cmif_total,
                    "items": contacts_missing_id_lim,
                    "truncated": cmif_trunc,
                },
                "intra_brand_duplicates": {
                    "total": dup_total,
                    "items": dup_lim,
                    "truncated": dup_trunc,
                    "group_count": group_count,
                },
            },
            "meta": {
                "brand_tienda_field_id": brand_tienda_field_id,
                "verifier_loaded": bool(verifier_by_loc),
                "verifier_entries": len(verifier_by_loc),
                "brand_present_in_any_branch": brand_present_in_any_branch,
            },
        }
    finally:
        conn.close()


# ---------------------------------------------------------------------------
# CLI / impresion
# ---------------------------------------------------------------------------

def print_report(data, show_missing=False, missing_cap=20):
    """Print the human-readable audit report through ``safe_print``.

    Args:
        data: Result dict with the layout returned by ``run_audit()``. The
            ``totals``, ``demos_excluded``, ``per_branch`` and ``missing``
            sections are read.
        show_missing: When true, also print a detailed listing for every
            section that has at least one item.
        missing_cap: Maximum number of items printed per detailed listing.
            Negative values are clamped to 0; ``None`` prints every item.
    """
    totals = data["totals"]
    brand = totals["brand"]
    agg = totals["branches_aggregate"]
    diff = totals["diff"]

    safe_print("=" * 72)
    safe_print("COMPARATIVA MARCA vs SUCURSALES (excluye cuentas demo)")
    safe_print("=" * 72)
    safe_print(f"  Marca: {brand['name']} ({brand['location_id']})")
    safe_print(f"    Contactos       : {brand['contacts']:>8}")
    safe_print(f"    Oportunidades   : {brand['opportunities']:>8}")
    safe_print(f"  Sucursales activas: {agg['branch_count']}")
    safe_print(f"    Contactos suma  : {agg['contacts']:>8}")
    safe_print(f"    Oportunidades   : {agg['opportunities']:>8}")
    safe_print("-" * 72)
    status_c = "OK (iguales)" if diff["contacts_match"] else f"DESCUADRE: {diff['contacts']:+}"
    status_o = "OK (iguales)" if diff["opportunities_match"] else f"DESCUADRE: {diff['opportunities']:+}"
    safe_print(f"  Diff contactos     : {status_c}")
    safe_print(f"  Diff oportunidades : {status_o}")

    demos = data["demos_excluded"]
    if demos:
        safe_print("-" * 72)
        safe_print(f"  Cuentas demo excluidas ({len(demos)}):")
        for d_acc in demos:
            safe_print(f"    - {d_acc['name']} ({d_acc['location_id']})")

    safe_print("=" * 72)
    safe_print("Desglose por sucursal:")
    safe_print(f"  {'Sucursal':<45} {'Cont.':>8} {'Opps':>8}")
    safe_print("  " + "-" * 64)
    for row in data["per_branch"]:
        # Truncate to 44 chars so the name always fits the 45-wide column.
        name = row["name"][:44]
        safe_print(f"  {name:<45} {row['contacts']:>8} {row['opportunities']:>8}")

    m = data["missing"]
    safe_print("=" * 72)
    safe_print("Resumen de huecos detectados:")
    safe_print(f"  Contactos en sucursal sin contraparte en Marca           : {m['contacts_in_branch_not_in_brand']['total']}")
    safe_print(f"  Contactos en Marca sin presencia en la sucursal asignada : {m['contacts_in_brand_not_in_assigned_branch']['total']}")
    safe_print(f"  Probables duplicados en Marca (homonimo en sucursal)     : {m['contacts_in_brand_probable_duplicate']['total']}")
    safe_print(f"  Contactos en Marca sin TIENDA poblada                    : {m['contacts_in_brand_without_tienda']['total']}")
    safe_print(f"  Contactos en Marca con TIENDA desconocida en verificador : {m['contacts_in_brand_with_unknown_tienda']['total']}")
    safe_print(f"  Contactos en Marca sin contraparte en NINGUNA sucursal   : {m['contacts_in_brand_not_in_any_branch']['total']}")
    safe_print(f"  Oportunidades en sucursal sin replica en Marca           : {m['opportunities_in_branch_not_in_brand']['total']}")
    # This section is read defensively: the payload may lack it.
    _dup = m.get("opportunities_in_brand_duplicate_link", {})
    safe_print(f"  Replicas DUPLICADAS en Marca (mismo ID Opp Sucursal)     : {_dup.get('total', 0)} en {_dup.get('group_count', 0)} grupos ({_dup.get('extra_opps', 0)} sobrantes)")

    if not show_missing:
        return

    # A negative cap used as a slice bound would drop items from the END of
    # the listing instead of limiting it, so clamp it. None keeps "no cap".
    cap = missing_cap if missing_cap is None else max(missing_cap, 0)

    def money(item):
        # NOTE(review): monetary_value is presumably nullable in the source
        # data; formatting None with ':.0f' raises TypeError, so show 0.
        return f"{(item.get('monetary_value') or 0):.0f}"

    def dump(title, key, formatter):
        # Tolerate an absent section, matching how the summary above reads
        # 'opportunities_in_brand_duplicate_link'.
        block = m.get(key) or {}
        if not block.get("total"):
            return
        safe_print("-" * 72)
        safe_print(f"  {title} ({block['total']} total, mostrando hasta {cap}):")
        for item in (block.get("items") or [])[:cap]:
            safe_print("    - " + formatter(item))

    dump(
        "Contactos en sucursal no presentes en Marca",
        "contacts_in_branch_not_in_brand",
        lambda i: f"{i['name']} | {i['phone'] or i['email'] or '(sin contacto)'} | sucursal: {i['branch_name']} | opps locales: {i['opps_in_branch']}",
    )
    dump(
        "Contactos en Marca ausentes de su sucursal asignada",
        "contacts_in_brand_not_in_assigned_branch",
        lambda i: f"{i['name']} | tienda='{i.get('tienda')}' | esperado: {i['expected_branch_name']} | opps en marca: {i['opps_in_brand']} | esta en otra sucursal: {i['present_in_other_branch']}",
    )
    dump(
        "Probables duplicados en Marca (existe homonimo con tel/email en sucursal asignada)",
        "contacts_in_brand_probable_duplicate",
        lambda i: f"{i['name']} | tienda='{i.get('tienda')}' | esperado: {i['expected_branch_name']} | brand_phone={i.get('phone')!r} | branch_phone={(i.get('branch_existing_contact') or {}).get('phone')!r}",
    )
    dump(
        "Oportunidades en sucursal sin replica en Marca",
        "opportunities_in_branch_not_in_brand",
        lambda i: f"{i['name']} [{i['status']}] | ${money(i)} | contacto: {i['contact_name']} | sucursal: {i['branch_name']} | motivo: {i['reason']}",
    )
    dump(
        "Replicas DUPLICADAS en Marca (mismo ID Oportunidad Sucursal)",
        "opportunities_in_brand_duplicate_link",
        lambda i: f"{i['recommended_action'].upper():>6} | {i['name']} [{i['status']}] | ${money(i)} | opp={i['id']} | link={i['link_value']} | origen: {i['branch_name']} | grupo de {i['group_size']}",
    )


def main():
    """CLI entry point: parse the flags, run the audit and print the result.

    Prints either the JSON dump (``--json``) or the human-readable report.

    Exit codes:
        2: ``run_audit`` raised ``FileNotFoundError`` (argparse also exits
           with 2 on invalid arguments).
        3: ``run_audit`` raised ``RuntimeError``.
    """

    def non_negative_int(raw):
        # Both limits end up as slice bounds (lst[:limit]); a negative value
        # would silently drop items from the end of each listing instead of
        # capping it, so reject it at parse time.
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"debe ser un entero >= 0 (recibido: {raw!r})") from None
        if value < 0:
            raise argparse.ArgumentTypeError(f"debe ser un entero >= 0 (recibido: {raw!r})")
        return value

    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--show-missing", action="store_true", help="Imprime los listados de ausentes.")
    parser.add_argument("--limit-missing", type=non_negative_int, default=None, help="Limita el listado interno antes de imprimir (default sin limite).")
    parser.add_argument("--missing-cap", type=non_negative_int, default=20, help="Cuantos items imprimir por listado cuando --show-missing.")
    parser.add_argument("--json", action="store_true", help="Imprime el resultado como JSON en vez del reporte humano.")
    args = parser.parse_args()

    try:
        data = run_audit(limit_missing=args.limit_missing)
    except FileNotFoundError as e:
        safe_print(f"ERROR: {e}")
        sys.exit(2)
    except RuntimeError as e:
        safe_print(f"ERROR: {e}")
        sys.exit(3)

    if args.json:
        safe_print(json.dumps(data, ensure_ascii=False, indent=2))
        return

    print_report(data, show_missing=args.show_missing, missing_cap=args.missing_cap)


# Run the CLI only when this file is executed directly; importing the module
# does not start the audit.
if __name__ == "__main__":
    main()