Files
MP-Manager/scripts/audit_brand_vs_branches_totals.py
T
2026-05-30 14:31:19 -06:00

1606 lines
71 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""audit_brand_vs_branches_totals.py
Comparativa de conteos totales entre la cuenta de Marca Principal (Monte
Providencia) y la suma de TODAS las sucursales. Las cuentas demo se excluyen
automaticamente (nombre contiene 'demo', case-insensitive).
Lo que produce:
- Totales agregados (contactos y oportunidades) Marca vs Sucursales.
- Desglose por sucursal con el conteo local.
- Listado de contactos en sucursal que no estan en Marca.
- Listado de contactos en Marca que no estan en la sucursal que les
corresponde segun el verificador (campo TIENDA del contacto Marca cruzado
con la columna TIENDA del verificador y la columna ID LOCATION BUCEFALO).
- Listado de oportunidades en sucursal sin contraparte en Marca.
Es read-only sobre `mp_manager.sqlite`. No toca GHL. La logica vive en
`run_audit()` para que el endpoint del dashboard la reutilice.
Uso CLI:
python scripts/audit_brand_vs_branches_totals.py
python scripts/audit_brand_vs_branches_totals.py --show-missing
python scripts/audit_brand_vs_branches_totals.py --json
python scripts/audit_brand_vs_branches_totals.py --limit-missing 100
"""
import argparse
import csv
import json
import os
import re
import sqlite3
import sys
import unicodedata
from collections import defaultdict
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT_DIR not in sys.path:
sys.path.insert(0, ROOT_DIR)
SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
if SCRIPTS_DIR not in sys.path:
sys.path.insert(0, SCRIPTS_DIR)
from paths import DB_PATH
from common import match_contacts as _match_contacts
# Path of the branch-verifier CSV export, looked up directly under ROOT_DIR.
VERIFIER_CSV = os.path.join(
    ROOT_DIR, "Monte Providencia - Verificador de sucursales y correos - Sucursales.csv"
)
# location_id of the main Brand ("Marca") account.
BRAND_LOCATION_ID = "GbKkBpCmKu2QmloKFHy3"
# Default name-similarity threshold that find_match forwards to
# common.match_contacts.
MATCH_THRESHOLD = 0.80
# Digital hub consolidation. Some "shell" branches (physical / non-digital)
# do not receive digital leads: those leads live in the "hub" location that
# absorbs them. The verifier CSV does not always carry the digital -> hub row
# for every store (e.g. TIENDA=METEPEC only maps to 85937, which is empty,
# while the leads live in Pilares 85935). Without this map those contacts show
# up as "present in another branch, not the assigned one" -> massive
# false-positive noise.
# Documented in the `verificador_tipo_de_tienda_colapso` memory and in Baserow
# 750 (Toluca / Metepec / Lerma cluster -> Pilares 85935).
# NOTE(review): the inline labels below repeat code 85941 on two entries and
# tag the Lerma entry with 85935 (the hub's own code) — confirm the labels.
PILARES_HUB_LOC = "uZnMH5bO6MXTHcgHeyq9"  # 85935 - MP - Pilares (digital hub)
DIGITAL_HUB_BY_SHELL = {
    "NSDniGzjxotVDNa5YxqW": PILARES_HUB_LOC,  # 85937 - MP - METEPEC
    "Xqpdy12avIk4NFsOhPBX": PILARES_HUB_LOC,  # 85941 - MP - Grand Plaza
    "pMPs9M4RaGJvWwfIFVIo": PILARES_HUB_LOC,  # 85941 - MP - Grand Plaza Toluca
    "RLAs9sQwbW2DOwzrTMYI": PILARES_HUB_LOC,  # 85939 - MP - Independencia
    "UsHXqoj2l6ND7Uc7sEo2": PILARES_HUB_LOC,  # 85938 - MP - SENDERO
    "lWp7F6rsgTjy3voFBZ1m": PILARES_HUB_LOC,  # 85935 - MP - Lerma
    "clhDZ0hIllKfV0AcgW53": PILARES_HUB_LOC,  # 85940 - MP - Isidro Fabela (0 contacts, NOT DIGITAL)
}
# Matches "demo" as a standalone word, case-insensitively; used by
# is_demo_account to exclude demo accounts.
DEMO_PATTERN = re.compile(r"\bdemo\b", re.IGNORECASE)
# ---------------------------------------------------------------------------
# Utilidades
# ---------------------------------------------------------------------------
def safe_print(*args, **kwargs):
    """Write ``args`` to stdout like ``print``, surviving unencodable text.

    Only the ``sep`` and ``end`` keyword arguments are honoured; any other
    keyword is ignored. When the stream raises ``UnicodeEncodeError`` the
    message is round-tripped through the stream encoding with
    ``errors="replace"`` and written again.
    """
    separator = kwargs.get("sep", " ")
    terminator = kwargs.get("end", "\n")
    message = separator.join(map(str, args))
    # Some stream replacements report no encoding; fall back to UTF-8.
    stream_encoding = sys.stdout.encoding or "utf-8"
    try:
        sys.stdout.write(message + terminator)
        sys.stdout.flush()
    except UnicodeEncodeError:
        degraded = message.encode(stream_encoding, errors="replace").decode(stream_encoding)
        sys.stdout.write(degraded + terminator)
        sys.stdout.flush()
def strip_accents(value):
    """Return ``str(value)`` without combining marks; ``""`` for falsy input.

    The text is decomposed with NFD so that accents become separate code
    points, and every code point in Unicode category ``Mn`` is dropped.
    """
    if not value:
        return ""
    decomposed = unicodedata.normalize("NFD", str(value))
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)
def normalize_phone(phone, last_n=10):
    """Reduce ``phone`` to its digits, keeping only the trailing ``last_n``.

    Inputs with fewer than ``last_n`` digits are returned whole (digits only);
    falsy input yields ``""``.
    """
    digits = "".join(re.findall(r"\d", str(phone or "")))
    if len(digits) < last_n:
        return digits
    return digits[-last_n:]
def normalize_email(email):
    """Return ``email`` as a trimmed, lower-cased string (``""`` when falsy)."""
    text = str(email) if email else ""
    trimmed = text.strip()
    return trimmed.lower()
def normalize_tienda(value):
    """Normalise a store name for matching.

    Accents are stripped, the text is upper-cased and every whitespace run is
    collapsed to a single space. Falsy input yields ``""``.
    """
    if not value:
        return ""
    tokens = strip_accents(str(value)).upper().split()
    return " ".join(tokens)
def is_demo_account(name):
    """Return True when ``name`` is non-empty and ``DEMO_PATTERN`` finds a match in it."""
    if not name:
        return False
    return DEMO_PATTERN.search(name) is not None
# Detects contacts that look like test records: standalone keywords (bounded
# by non-alphanumeric separators or the string edges) in any of the basic
# fields. Intended to match 'Juan Prueba', 'test@test.com' and '+52 prueba',
# but NOT false positives such as 'Pruebal' or 'Contestino'.
TEST_KEYWORDS_PATTERN = re.compile(
    r"(?:^|[^a-z0-9])(test|testing|prueba|pruebas)(?:$|[^a-z0-9])",
    re.IGNORECASE,
)


def looks_like_test_contact(c):
    """Return True if the contact's name, email or phone holds a test keyword.

    The non-empty values of ``first_name``, ``last_name``, ``email`` and
    ``phone`` are accent-stripped, lower-cased and joined with spaces before
    ``TEST_KEYWORDS_PATTERN`` is searched in the result.
    """
    pieces = []
    for field in ("first_name", "last_name", "email", "phone"):
        raw = c.get(field) or ""
        if raw:
            pieces.append(strip_accents(raw).lower())
    haystack = " ".join(pieces)
    if not haystack:
        return False
    return TEST_KEYWORDS_PATTERN.search(haystack) is not None
def fmt_contact(c):
    """Project a contact mapping onto the ``id/name/phone/email`` report shape.

    ``name`` is "<first_name> <last_name>" trimmed, or ``"Sin nombre"`` when
    both parts are empty; missing phone/email become ``""``.
    """
    first = c.get("first_name") or ""
    last = c.get("last_name") or ""
    display_name = f"{first} {last}".strip()
    if not display_name:
        display_name = "Sin nombre"
    summary = {"id": c.get("id"), "name": display_name}
    summary["phone"] = c.get("phone") or ""
    summary["email"] = c.get("email") or ""
    return summary
# ---------------------------------------------------------------------------
# Carga de datos
# ---------------------------------------------------------------------------
def load_verifier():
    """Read the verifier CSV into two lookup tables.

    Returns:
        ``(by_location, by_tienda)`` where ``by_location`` maps each
        ``ID LOCATION BUCEFALO`` value to a dict with ``tienda_raw``,
        ``tienda_norm`` and ``sucursal``, and ``by_tienda`` maps a normalised
        TIENDA name to a location id. Both are empty when the CSV is missing.
    """
    by_location, by_tienda = {}, {}
    if not os.path.exists(VERIFIER_CSV):
        return by_location, by_tienda

    def _cell(row, column):
        # Missing cells are treated as empty strings.
        return (row.get(column) or "").strip()

    with open(VERIFIER_CSV, encoding="utf-8-sig", newline="") as fh:
        for row in csv.DictReader(fh):
            loc = _cell(row, "ID LOCATION BUCEFALO")
            if not loc:
                continue
            tienda_raw = _cell(row, "TIENDA")
            sucursal_raw = _cell(row, "SUCURSAL")
            # A lone "-" in the sheet is treated the same as an empty cell.
            has_tienda = bool(tienda_raw) and tienda_raw != "-"
            tienda_norm = normalize_tienda(tienda_raw) if has_tienda else None
            by_location[loc] = {
                "tienda_raw": tienda_raw,
                "tienda_norm": tienda_norm,
                "sucursal": None if sucursal_raw in ("", "-") else sucursal_raw,
            }
            # A store may appear on several rows; the first non-Brand location
            # seen for a given TIENDA wins the reverse index.
            if tienda_norm and loc != BRAND_LOCATION_ID and tienda_norm not in by_tienda:
                by_tienda[tienda_norm] = loc
    return by_location, by_tienda
def resolve_tienda_field_id(conn, location_id):
    """Look up the contact field id named 'TIENDA' for a location.

    Reads ``object_schemas`` rows with ``object_key='contact'`` and returns the
    ``field_id`` of the first row whose accent-stripped, lower-cased, trimmed
    ``field_name`` equals ``"tienda"``; ``None`` if there is none.
    """
    cursor = conn.execute(
        "SELECT field_id, field_name FROM object_schemas WHERE location_id=? AND object_key='contact'",
        (location_id,),
    )
    schema_rows = cursor.fetchall()
    hits = (
        row["field_id"]
        for row in schema_rows
        if strip_accents(row["field_name"]).lower().strip() == "tienda"
    )
    return next(hits, None)
def extract_tienda_from_custom_fields(custom_fields_json, field_id):
    """Return the value stored for ``field_id`` in a custom-fields JSON list.

    Args:
        custom_fields_json: JSON text expected to decode to a list of
            custom-field objects.
        field_id: id to look for; compared against each entry's ``id`` and
            ``fieldId`` keys.

    Returns:
        The first value found under ``value``, ``fieldValue`` or
        ``fieldValueString`` (in that order) that is neither ``None`` nor
        ``""``. ``None`` when either argument is falsy, the payload is not
        decodable JSON, it is not a list, or no entry carries a value.
    """
    if not field_id or not custom_fields_json:
        return None
    try:
        entries = json.loads(custom_fields_json)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        return None
    if not isinstance(entries, list):
        return None
    for entry in entries:
        # Skip stray null/scalar elements instead of failing on ``.get``.
        if not isinstance(entry, dict):
            continue
        if entry.get("id") != field_id and entry.get("fieldId") != field_id:
            continue
        for key in ("value", "fieldValue", "fieldValueString"):
            candidate = entry.get(key)
            if candidate is not None and candidate != "":
                return candidate
        # An entry with no value does not stop the scan: a later entry for the
        # same field id may still carry one.
    return None
def resolve_sucursal_field_id(conn, location_id):
    """Look up the contact field id named 'Sucursal' for a location.

    Returns the ``field_id`` of the first ``object_schemas`` contact row whose
    accent-stripped, lower-cased, trimmed ``field_name`` equals
    ``"sucursal"``; ``None`` if there is none.
    """
    cursor = conn.execute(
        "SELECT field_id, field_name FROM object_schemas WHERE location_id=? AND object_key='contact'",
        (location_id,),
    )
    schema_rows = cursor.fetchall()
    hits = (
        row["field_id"]
        for row in schema_rows
        if strip_accents(row["field_name"]).lower().strip() == "sucursal"
    )
    return next(hits, None)
# Opportunity field that links Branch <-> Brand (deterministic key).
OPP_LINK_FIELD_KEY = "opportunity.id_oportunidad_sucursal"
OPP_LINK_FIELD_NAME = "id oportunidad sucursal"
# Native GHL ids are exactly 20 alphanumeric characters.
OPP_ID_PATTERN = re.compile(r"^[A-Za-z0-9]{20}$")


def resolve_opp_link_field_id(conn, location_id):
    """Field id of the opportunity custom field 'ID Oportunidad Sucursal'.

    An exact ``field_key`` match against ``OPP_LINK_FIELD_KEY`` on any row
    takes precedence; otherwise the first row whose normalised ``field_name``
    equals ``OPP_LINK_FIELD_NAME`` is used. Returns ``None`` when neither
    lookup succeeds.
    """
    schema_rows = conn.execute(
        "SELECT field_id, field_name, field_key FROM object_schemas "
        "WHERE location_id=? AND object_key='opportunity'",
        (location_id,),
    ).fetchall()
    # Pass 1: exact key match across all rows.
    for row in schema_rows:
        if row["field_key"] == OPP_LINK_FIELD_KEY:
            return row["field_id"]
    # Pass 2: fall back to the human-readable field name.
    by_name = (
        row["field_id"]
        for row in schema_rows
        if strip_accents(row["field_name"]).lower().strip() == OPP_LINK_FIELD_NAME
    )
    return next(by_name, None)
def extract_opp_link_value(custom_fields_json, field_id):
    """Return an opportunity's 'ID Oportunidad Sucursal' custom-field value.

    Args:
        custom_fields_json: JSON text expected to decode to a list of
            custom-field objects.
        field_id: id of the link field; compared against each entry's ``id``
            and ``fieldId`` keys.

    Returns:
        The first value found under ``value``, ``fieldValue`` or
        ``fieldValueString`` (in that order) that is neither ``None`` nor
        ``""``. ``None`` when either argument is falsy, the payload is not
        decodable JSON, it is not a list, or the field is empty/absent.
    """
    if not field_id or not custom_fields_json:
        return None
    try:
        entries = json.loads(custom_fields_json)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        return None
    if not isinstance(entries, list):
        return None
    for entry in entries:
        # Skip stray null/scalar elements instead of failing on ``.get``.
        if not isinstance(entry, dict):
            continue
        if entry.get("id") != field_id and entry.get("fieldId") != field_id:
            continue
        for key in ("value", "fieldValue", "fieldValueString"):
            candidate = entry.get(key)
            if candidate is not None and candidate != "":
                return candidate
    return None
# Contact field that links Branch <-> Brand (deterministic key, parallel to
# the opportunity link field).
CONTACT_LINK_FIELD_KEY = "contact.id_contacto_sucursal"
CONTACT_LINK_FIELD_NAME = "id contacto sucursal"


def resolve_contact_link_field_id(conn, location_id):
    """Field id of the contact custom field 'ID Contacto Sucursal'.

    An exact ``field_key`` match against ``CONTACT_LINK_FIELD_KEY`` on any row
    takes precedence; otherwise the first row whose normalised ``field_name``
    equals ``CONTACT_LINK_FIELD_NAME`` is used. Returns ``None`` when neither
    lookup succeeds.
    """
    schema_rows = conn.execute(
        "SELECT field_id, field_name, field_key FROM object_schemas "
        "WHERE location_id=? AND object_key='contact'",
        (location_id,),
    ).fetchall()
    # Pass 1: exact key match across all rows.
    for row in schema_rows:
        if row["field_key"] == CONTACT_LINK_FIELD_KEY:
            return row["field_id"]
    # Pass 2: fall back to the human-readable field name.
    by_name = (
        row["field_id"]
        for row in schema_rows
        if strip_accents(row["field_name"]).lower().strip() == CONTACT_LINK_FIELD_NAME
    )
    return next(by_name, None)
def extract_contact_link_value(custom_fields_json, field_id):
    """Return a contact's 'ID Contacto Sucursal' custom-field value.

    Args:
        custom_fields_json: JSON text expected to decode to a list of
            custom-field objects.
        field_id: id of the link field; compared against each entry's ``id``
            and ``fieldId`` keys.

    Returns:
        The first value found under ``value``, ``fieldValue`` or
        ``fieldValueString`` (in that order) that is neither ``None`` nor
        ``""``. ``None`` when either argument is falsy, the payload is not
        decodable JSON, it is not a list, or the field is empty/absent.
    """
    if not field_id or not custom_fields_json:
        return None
    try:
        entries = json.loads(custom_fields_json)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        return None
    if not isinstance(entries, list):
        return None
    for entry in entries:
        # Skip stray null/scalar elements instead of failing on ``.get``.
        if not isinstance(entry, dict):
            continue
        if entry.get("id") != field_id and entry.get("fieldId") != field_id:
            continue
        for key in ("value", "fieldValue", "fieldValueString"):
            candidate = entry.get(key)
            if candidate is not None and candidate != "":
                return candidate
    return None
# Normalisation patterns for tolerant "Sucursal" matching.
# They rewrite common abbreviations to their canonical form before comparing.
SUCURSAL_ABBREV_PATTERNS = [
    (re.compile(r"\bedo\.?\s*de\s*mex\.?\b"), "estado de mexico"),
    (re.compile(r"\bedo\.?\s*mex\.?\b"), "estado de mexico"),
    (re.compile(r"\bedomex\b"), "estado de mexico"),
    (re.compile(r"\bedo\b"), "estado de mexico"),
    (re.compile(r"\bcdmx\b"), "ciudad de mexico"),
    (re.compile(r"\bd\.?\s*f\.?\b"), "ciudad de mexico"),
    # "Cd." / "Cd " as a proper-name prefix (Cd. Satélite, Cd Juárez).
    # Listed after \bcdmx\b so that "cdmx" is not broken apart.
    (re.compile(r"\bcd\.?\s+"), "ciudad "),
    (re.compile(r"\bn\.?\s*l\.?\b"), "nuevo leon"),
    (re.compile(r"\bqro\.?\b"), "queretaro"),
    (re.compile(r"\bpue\.?\b"), "puebla"),
    (re.compile(r"\bgto\.?\b"), "guanajuato"),
    (re.compile(r"\bmich\.?\b"), "michoacan"),
    (re.compile(r"\bmor\.?\b"), "morelos"),
    (re.compile(r"\boax\.?\b"), "oaxaca"),
    (re.compile(r"\bgro\.?\b"), "guerrero"),
    (re.compile(r"\bhgo\.?\b"), "hidalgo"),
    (re.compile(r"\btams\.?\b"), "tamaulipas"),
    (re.compile(r"\btamps\.?\b"), "tamaulipas"),
    (re.compile(r"\bchis\.?\b"), "chiapas"),
    (re.compile(r"\bcamp\.?\b"), "campeche"),
    (re.compile(r"\bq\.?\s*roo\b"), "quintana roo"),
    (re.compile(r"\bcoah\.?\b"), "coahuila"),
]


def normalize_sucursal_value(value):
    """Normalise a Sucursal value for matching.

    Steps: strip accents, lower-case, turn periods and commas into spaces,
    collapse whitespace, apply every ``SUCURSAL_ABBREV_PATTERNS`` substitution
    in list order, and collapse whitespace again. Falsy input yields ``""``.
    """
    if not value:
        return ""
    text = strip_accents(str(value)).lower()
    # Punctuation is removed before the abbreviation pass, so the patterns see
    # "edo mex" rather than "edo. mex.".
    text = text.replace(".", " ").replace(",", " ")
    text = " ".join(text.split())
    for pattern, expansion in SUCURSAL_ABBREV_PATTERNS:
        text = pattern.sub(expansion, text)
    return " ".join(text.split())
def resolve_location_from_sucursal(sucursal_value, verifier_by_loc):
    """Map a Sucursal custom-field value to a location id.

    The value is compared, after ``normalize_sucursal_value``, against the
    ``sucursal`` entry of every non-Brand location in ``verifier_by_loc``.

    Returns:
        ``(location_id, match_kind)`` where ``match_kind`` is ``"exact"`` or
        ``"substring"``, or ``(None, None)`` when nothing matches or the
        substring candidates are too close to tell apart.
    """
    wanted = normalize_sucursal_value(sucursal_value) if sucursal_value else ""
    if not wanted:
        return None, None

    exact_hit = None
    partial_hits = []  # (length gap, location id)
    for loc_id, info in verifier_by_loc.items():
        if loc_id == BRAND_LOCATION_ID:
            continue
        candidate = normalize_sucursal_value(info.get("sucursal") or "")
        if not candidate:
            continue
        if candidate == wanted:
            exact_hit = loc_id
            break
        # Two-way containment: either string may hold the other.
        if wanted in candidate or candidate in wanted:
            partial_hits.append((abs(len(candidate) - len(wanted)), loc_id))

    if exact_hit:
        return exact_hit, "exact"
    if not partial_hits:
        return None, None
    if len(partial_hits) == 1:
        return partial_hits[0][1], "substring"

    # Several candidates: accept the closest in length only when it beats the
    # runner-up by more than one character; otherwise it is ambiguous.
    partial_hits.sort(key=lambda hit: hit[0])
    best, runner_up = partial_hits[0], partial_hits[1]
    if runner_up[0] - best[0] <= 1:
        return None, None
    return best[1], "substring"
def load_accounts_filtered(conn):
    """Split the ``accounts`` table into Brand, branches and demo accounts.

    Returns:
        A ``(brand, branches, demos)`` tuple: ``brand`` is the row dict whose
        ``location_id`` equals ``BRAND_LOCATION_ID`` (``None`` if absent),
        ``branches`` the remaining non-demo rows and ``demos`` every row whose
        ``nombre`` satisfies ``is_demo_account``. The demo check runs first,
        so a demo-named row is never reported as Brand or branch.
    """
    brand, branches, demos = None, [], []
    account_rows = conn.execute("SELECT location_id, nombre, type FROM accounts").fetchall()
    for row in account_rows:
        account = dict(row)
        if is_demo_account(account["nombre"]):
            demos.append(account)
        elif account["location_id"] == BRAND_LOCATION_ID:
            brand = account
        else:
            branches.append(account)
    return brand, branches, demos
def load_contacts(conn, location_id):
    """Return every contact row of ``location_id`` as a plain dict.

    ``conn`` must yield mapping-like rows (e.g. ``sqlite3.Row``) so that each
    one can be converted with ``dict``.
    """
    cursor = conn.execute(
        "SELECT id, first_name, last_name, phone, email, custom_fields_json, date_added "
        "FROM contacts WHERE location_id = ?",
        (location_id,),
    )
    contact_rows = cursor.fetchall()
    return [dict(row) for row in contact_rows]
def load_opps(conn, location_id):
    """Return every opportunity row of ``location_id`` as a plain dict.

    ``conn`` must yield mapping-like rows (e.g. ``sqlite3.Row``) so that each
    one can be converted with ``dict``.
    """
    cursor = conn.execute(
        "SELECT id, contact_id, status, name, pipeline_id, monetary_value, custom_fields_json "
        "FROM opportunities WHERE location_id = ?",
        (location_id,),
    )
    opp_rows = cursor.fetchall()
    return [dict(row) for row in opp_rows]
# ---------------------------------------------------------------------------
# Matching
# ---------------------------------------------------------------------------
def _contact_full_name_norm(c):
    """Return "<first_name> <last_name>" accent-stripped, lower-cased and
    whitespace-collapsed; ``""`` when both parts are empty."""
    first = c.get("first_name") or ""
    last = c.get("last_name") or ""
    tokens = strip_accents(f"{first} {last}").lower().split()
    return " ".join(tokens)
def build_contact_index(contacts):
    """Build the lookup tables used for contact matching.

    Returns ``(by_phone, by_email, by_name)``, each a ``defaultdict(list)``
    keyed by the normalised phone, email and full name respectively. Contacts
    lacking a given key are simply absent from that table.

    ``by_name`` holds EVERY named contact of the target, with or without
    phone/email. Per the original author's notes, a Brand contact without
    phone/email always originates from a branch, where the same person may
    well have phone/email; the guard against homonym false positives is
    applied on the source side in ``find_match``, which only tries the name
    lookup when the source has neither phone nor email.
    """
    by_phone = defaultdict(list)
    by_email = defaultdict(list)
    by_name = defaultdict(list)
    for contact in contacts:
        keyed_tables = (
            (by_phone, normalize_phone(contact.get("phone"))),
            (by_email, normalize_email(contact.get("email"))),
            (by_name, _contact_full_name_norm(contact)),
        )
        for table, key in keyed_tables:
            if key:
                table[key].append(contact)
    return by_phone, by_email, by_name
def find_match(contact, by_phone, by_email, by_name=None,
               return_collisions=False, threshold=MATCH_THRESHOLD):
    """Find counterparts of ``contact`` in cascade: phone+name -> email -> name.

    Rules:
        1. Phone: candidates sharing the normalised phone count as matches
           only when ``common.match_contacts`` rates the pair ``"strong"`` or
           ``"medium"`` at ``threshold``. The rest are recorded as collisions
           (same number, diverging name) and are NOT matches.
        2. Email: candidates sharing the normalised email match without a name
           check, except those already rejected as phone collisions — they
           must not be "rescued" through the email.
        3. Name: tried only when ``by_name`` is given and the source has
           neither phone nor email.

    Args:
        contact: source contact mapping.
        by_phone, by_email, by_name: indexes from ``build_contact_index``.
        return_collisions: when True return ``(matches, collisions)``; when
            False (default, backwards compatible) return only ``matches``.
        threshold: similarity threshold forwarded to ``match_contacts``.
    """
    phone_key = normalize_phone(contact.get("phone"))
    email_key = normalize_email(contact.get("email"))
    matches, collisions = [], []
    matched_ids = set()
    rejected_by_phone = set()

    def _accept(candidate):
        # Record a match once; later stages skip ids already accepted.
        matches.append(candidate)
        matched_ids.add(candidate["id"])

    if phone_key and phone_key in by_phone:
        for candidate in by_phone[phone_key]:
            if candidate["id"] in matched_ids:
                continue
            verdict = _match_contacts(contact, candidate, threshold=threshold)
            if verdict["level"] in ("strong", "medium"):
                _accept(candidate)
            else:
                collisions.append(candidate)
                rejected_by_phone.add(candidate["id"])

    if email_key and email_key in by_email:
        for candidate in by_email[email_key]:
            candidate_id = candidate["id"]
            if candidate_id in matched_ids or candidate_id in rejected_by_phone:
                continue
            _accept(candidate)

    if by_name is not None and not (phone_key or email_key):
        name_key = _contact_full_name_norm(contact)
        if name_key and name_key in by_name:
            for candidate in by_name[name_key]:
                if candidate["id"] not in matched_ids:
                    _accept(candidate)

    return (matches, collisions) if return_collisions else matches
# ---------------------------------------------------------------------------
# Fuzzy matching (permissive modes) — used to detect "possible matches" in
# other branches when strict matching does not find them.
# Purely informational output: it does NOT move contacts to another bucket; it
# is only attached to each item as a warning so the operator can decide.
# ---------------------------------------------------------------------------
# Domains that email_canonical treats as Gmail (dots in the local part are
# dropped and the domain is unified to "gmail.com").
_GMAIL_DOMAINS = {"gmail.com", "googlemail.com"}
def normalize_phone_partial(phone, last_n=7):
    """Return the trailing ``last_n`` digits of ``phone`` (default 7).

    Unlike ``normalize_phone``, inputs with fewer than ``last_n`` digits yield
    ``""`` rather than a shorter key.
    """
    digits = "".join(re.findall(r"\d", str(phone or "")))
    if len(digits) < last_n:
        return ""
    return digits[-last_n:]
def email_local_part(email):
    """Return the normalised text before the first ``@``; ``""`` if there is
    no ``@`` in the normalised email."""
    normalized = normalize_email(email)
    local, at_sign, _domain = normalized.partition("@")
    return local if at_sign else ""
def email_canonical(email):
    """Return a canonical form of ``email`` for tolerant comparison.

    The email is normalised, any ``+alias`` suffix of the local part is
    removed and, for domains in ``_GMAIL_DOMAINS``, dots are stripped from the
    local part and the domain becomes ``gmail.com``. This lets written
    variations of one mailbox compare equal
    (``juan.perez+spam@gmail.com`` -> ``juanperez@gmail.com``).
    Returns ``""`` when there is no ``@`` or the local part ends up empty.
    """
    normalized = normalize_email(email)
    if "@" not in normalized:
        return ""
    local, domain = normalized.split("@", 1)
    local = local.partition("+")[0]
    if domain in _GMAIL_DOMAINS:
        # googlemail.com and gmail.com are unified under gmail.com.
        local, domain = local.replace(".", ""), "gmail.com"
    if not local:
        return ""
    return f"{local}@{domain}"
def first_last_norm(c):
    """Return "<first token of first_name> <last token of last_name>".

    Both parts are accent-stripped and lower-cased. This tolerates
    inconsistent name splits across CRMs:
        - 'Juan Pablo' + 'Franco'        -> 'juan franco'
        - 'Juan' + 'Pablo Franco'        -> 'juan franco'
        - 'Juan' + 'Franco Gutierrez'    -> 'juan gutierrez'
    Using the LAST token of ``last_name`` avoids the collision that the first
    token would cause when it is really a middle name ('Juan' + 'Pablo Franco'
    vs 'Juan' + 'Pablo Jimenez' would both key as 'juan pablo').
    Returns ``""`` when either part has no tokens.
    """
    given = (c.get("first_name") or "").strip()
    family = (c.get("last_name") or "").strip()
    given_tokens = strip_accents(given).lower().split()
    family_tokens = strip_accents(family).lower().split()
    if not given_tokens or not family_tokens:
        return ""
    return f"{given_tokens[0]} {family_tokens[-1]}"
def build_fuzzy_indexes(contacts):
    """Build the extra indexes used by the permissive matching strategies.

    Returns a dict with the keys ``phone_partial``, ``email_local``,
    ``email_canonical`` and ``first_last``; each value is a
    ``defaultdict(list)`` mapping the corresponding key to the contact dicts
    that produce it. The contact dicts are stored as given.
    NOTE(review): ``find_fuzzy_matches`` reads ``_loc`` / ``_loc_name`` from
    these contacts, so callers presumably inject them beforehand — confirm.
    """
    indexes = {
        "phone_partial": defaultdict(list),
        "email_local": defaultdict(list),
        "email_canonical": defaultdict(list),
        "first_last": defaultdict(list),
    }
    for contact in contacts:
        email = contact.get("email")
        keys = {
            "phone_partial": normalize_phone_partial(contact.get("phone")),
            "email_local": email_local_part(email),
            "email_canonical": email_canonical(email),
            "first_last": first_last_norm(contact),
        }
        for strategy, key in keys.items():
            if key:
                indexes[strategy][key].append(contact)
    return indexes
def find_fuzzy_matches(contact, fuzzy_indexes, exclude_ids=None, strict_match_phone=None, strict_match_email=None):
    """Return permissive ("fuzzy") candidate matches for ``contact``.

    Args:
        contact: source contact mapping.
        fuzzy_indexes: dict produced by ``build_fuzzy_indexes``.
        exclude_ids: optional iterable of candidate ids to omit (e.g. strict
            matches already counted, to avoid duplicating the warning).
        strict_match_phone: normalised source phone; phone-partial candidates
            whose full normalised phone equals it are dropped.
        strict_match_email: normalised source email; canonical-email
            candidates whose normalised email equals it are dropped.

    Returns:
        A list of dicts with ``id``, ``location_id``, ``location_name``,
        ``first_name``, ``last_name``, ``phone``, ``email``, ``strategy``,
        ``strategy_label`` and ``score``, sorted by descending score and then
        by location name. A candidate hit by several strategies keeps only
        the highest-scoring one.
    """
    skipped_ids = set(exclude_ids or [])
    # strategy -> (human-readable label, confidence score)
    # NOTE(review): the "first_last" label mentions the first surname, while
    # first_last_norm keys on the LAST token of last_name — confirm wording.
    labels = {
        "phone_partial": ("Teléfono parcial (últ. 7 dígitos)", 90),
        "email_canonical": ("Email canónico (gmail sin puntos/alias)", 95),
        "email_local": ("Misma parte local del email (otro dominio)", 70),
        "first_last": ("Mismo nombre + primer apellido", 65),
    }
    source_email = normalize_email(contact.get("email"))

    # Collect (strategy, hits) batches in a fixed order; the order decides the
    # relative position of equally-ranked candidates in the output.
    batches = []

    # Partial phone: only when the source phone has enough digits.
    partial_phone = normalize_phone_partial(contact.get("phone"))
    if partial_phone:
        hits = fuzzy_indexes["phone_partial"].get(partial_phone, [])
        if strict_match_phone:
            hits = [m for m in hits if normalize_phone(m.get("phone")) != strict_match_phone]
        batches.append(("phone_partial", hits))

    # Canonical email: only when the source email has a domain.
    canonical = email_canonical(contact.get("email"))
    if canonical:
        hits = fuzzy_indexes["email_canonical"].get(canonical, [])
        if strict_match_email:
            hits = [m for m in hits if normalize_email(m.get("email")) != strict_match_email]
        batches.append(("email_canonical", hits))

    # Same local part on another address; identical emails are left to the
    # canonical/strict strategies.
    local_part = email_local_part(contact.get("email"))
    if local_part:
        hits = fuzzy_indexes["email_local"].get(local_part, [])
        hits = [m for m in hits if normalize_email(m.get("email")) != source_email]
        batches.append(("email_local", hits))

    # Same first-name token + last token of the surname.
    name_key = first_last_norm(contact)
    if name_key:
        batches.append(("first_last", fuzzy_indexes["first_last"].get(name_key, [])))

    best = {}  # candidate id -> (candidate, strategy, label, score)
    for strategy, hits in batches:
        label, score = labels[strategy]
        for candidate in hits:
            candidate_id = candidate.get("id")
            if not candidate_id or candidate_id in skipped_ids:
                continue
            current = best.get(candidate_id)
            if current is None or score > current[3]:
                best[candidate_id] = (candidate, strategy, label, score)

    results = [
        {
            "id": candidate_id,
            "location_id": candidate.get("_loc"),
            "location_name": candidate.get("_loc_name"),
            "first_name": candidate.get("first_name"),
            "last_name": candidate.get("last_name"),
            "phone": candidate.get("phone"),
            "email": candidate.get("email"),
            "strategy": strategy,
            "strategy_label": label,
            "score": score,
        }
        for candidate_id, (candidate, strategy, label, score) in best.items()
    ]
    results.sort(key=lambda item: (-item["score"], item.get("location_name") or ""))
    return results
# ---------------------------------------------------------------------------
# Auditoria principal
# ---------------------------------------------------------------------------
def run_audit(limit_missing=None):
"""Ejecuta la comparativa y devuelve un dict JSON-serializable.
Args:
limit_missing: si es int, recorta cada listado de ausentes a N items
(para respuestas API mas ligeras). None = sin recorte.
"""
if not os.path.exists(DB_PATH):
raise FileNotFoundError(
f"No existe {DB_PATH}. Corre una sincronizacion global primero."
)
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
try:
brand, branches, demos = load_accounts_filtered(conn)
if not brand:
raise RuntimeError(
"No se encontro la cuenta de Marca en la tabla accounts. "
"Corre la sincronizacion para poblar el catalogo."
)
verifier_by_loc, verifier_by_tienda = load_verifier()
brand_tienda_field_id = resolve_tienda_field_id(conn, BRAND_LOCATION_ID)
brand_sucursal_field_id = resolve_sucursal_field_id(conn, BRAND_LOCATION_ID)
brand_opp_link_field_id = resolve_opp_link_field_id(conn, BRAND_LOCATION_ID)
brand_contact_link_field_id = resolve_contact_link_field_id(conn, BRAND_LOCATION_ID)
brand_contacts = load_contacts(conn, BRAND_LOCATION_ID)
brand_opps = load_opps(conn, BRAND_LOCATION_ID)
branch_data = {}
total_branch_contacts = 0
total_branch_opps = 0
per_branch_summary = []
for b in branches:
loc = b["location_id"]
bc = load_contacts(conn, loc)
bo = load_opps(conn, loc)
branch_data[loc] = {"contacts": bc, "opps": bo, "name": b["nombre"]}
total_branch_contacts += len(bc)
total_branch_opps += len(bo)
# ---- Indices globales de sucursal (para buscar contraparte de Marca) ----
all_branch_contacts = []
for loc, data in branch_data.items():
for c in data["contacts"]:
aug = dict(c)
aug["_loc"] = loc
aug["_loc_name"] = data["name"]
all_branch_contacts.append(aug)
branch_idx_phone, branch_idx_email, branch_idx_name = build_contact_index(all_branch_contacts)
# Indices fuzzy globales (para detectar "posibles coincidencias" en
# otras sucursales cuando el match estricto no las encuentra). Solo
# se consultan para los items del bucket missing_in_assigned_branch.
branch_fuzzy_indexes = build_fuzzy_indexes(all_branch_contacts)
# Indice por sucursal individual (para verificar si el contacto Marca esta
# en la sucursal especifica que le toca segun el verificador).
per_branch_idx = {}
for loc, data in branch_data.items():
per_branch_idx[loc] = build_contact_index(data["contacts"])
# Opps por contact_id en cada sucursal y en Marca.
brand_opps_by_cid = defaultdict(list)
for o in brand_opps:
brand_opps_by_cid[o["contact_id"]].append(o)
# Índice de opps de Marca por el valor del campo "ID Oportunidad Sucursal"
# (= id nativo de la opp de sucursal de origen). Es la llave determinística
# para el match por campo (criterio principal del bucket de opps faltantes).
brand_opps_by_link = {}
for o in brand_opps:
lv = extract_opp_link_value(o.get("custom_fields_json"), brand_opp_link_field_id)
if lv:
brand_opps_by_link.setdefault(lv, o)
# Índice de contactos de Marca por el valor del campo "ID Contacto Sucursal"
# (= id nativo del contacto de sucursal de origen). Llave determinística
# para match por campo en el bucket contacts_in_branch_not_in_brand.
brand_contacts_by_link = {}
for c in brand_contacts:
lv = extract_contact_link_value(c.get("custom_fields_json"), brand_contact_link_field_id)
if lv:
brand_contacts_by_link.setdefault(lv, c)
per_branch_opps_by_cid = {}
for loc, data in branch_data.items():
grouped = defaultdict(list)
for o in data["opps"]:
grouped[o.get("contact_id")].append(o)
per_branch_opps_by_cid[loc] = grouped
# ----------------------------------------------------------------------
# 1) Contactos en sucursal sin contraparte en Marca
# ----------------------------------------------------------------------
brand_idx_phone, brand_idx_email, brand_idx_name = build_contact_index(brand_contacts)
missing_in_brand = [] # contactos sucursal que no estan en Marca
for loc, data in branch_data.items():
for c in data["contacts"]:
# Criterio PRINCIPAL: match por el campo "ID Contacto Sucursal".
# Si existe un contacto Marca cuyo valor de ese campo == id nativo
# de este contacto de sucursal, está replicado. Determinístico.
if c.get("id") in brand_contacts_by_link:
continue
# Respaldo: lógica histórica por phone/email/name.
if find_match(c, brand_idx_phone, brand_idx_email, brand_idx_name):
continue
opps_here = per_branch_opps_by_cid[loc].get(c["id"], [])
missing_in_brand.append({
**fmt_contact(c),
"branch_location_id": loc,
"branch_name": data["name"],
"opps_in_branch": len(opps_here),
})
# ----------------------------------------------------------------------
# 2) Contactos en Marca que no estan en la sucursal que les corresponde
# por el verificador (TIENDA del contacto -> location_id).
# ----------------------------------------------------------------------
missing_in_assigned_branch = [] # NO esta en ninguna sucursal (incluye la asignada). Tiene TIENDA.
present_in_other_branch_not_assigned = [] # Esta en OTRA sucursal, no la asignada.
probable_duplicate_in_brand = [] # Marca tiene phone/email pero hay homónimo con phone/email en la sucursal asignada -> probable duplicado en Marca.
brand_without_tienda = [] # contactos Marca sin TIENDA poblada
brand_with_unknown_tienda = [] # contactos Marca con TIENDA que no matchea ninguna fila del verificador
brand_present_in_any_branch = 0
brand_not_in_any_branch = [] # contacto Marca que no aparece en ninguna sucursal (no asignable)
# Pre-indexar nombres por sucursal para encontrar el candidato exacto en la
# sucursal asignada cuando el contacto Marca no matchea por phone/email.
per_branch_name_idx = {}
per_branch_by_id = {}
for loc, data in branch_data.items():
name_idx = {}
by_id = {}
for bc in data["contacts"]:
by_id[bc["id"]] = bc
full = f"{bc.get('first_name') or ''} {bc.get('last_name') or ''}"
nm = " ".join(strip_accents(full).lower().split())
if nm:
name_idx.setdefault(nm, []).append(bc)
per_branch_name_idx[loc] = name_idx
per_branch_by_id[loc] = by_id
def _enrich_other_branches(global_matches_list):
"""Devuelve other_branches con id + tienda del verificador."""
out = []
for m in global_matches_list[:5]:
loc_id = m["_loc"]
vinfo = verifier_by_loc.get(loc_id) or {}
out.append({
"location_id": loc_id,
"name": m.get("_loc_name") or branch_data.get(loc_id, {}).get("name"),
"id": m["id"],
"tienda_value": vinfo.get("tienda_raw"),
})
return out
for c in brand_contacts:
tienda_value = extract_tienda_from_custom_fields(
c.get("custom_fields_json"), brand_tienda_field_id
)
sucursal_value = extract_tienda_from_custom_fields(
c.get("custom_fields_json"), brand_sucursal_field_id
)
tienda_norm = normalize_tienda(tienda_value) if tienda_value else None
target_loc = verifier_by_tienda.get(tienda_norm) if tienda_norm else None
# Resolution source: "tienda" | "sucursal_exact" | "sucursal_substring" | None
target_loc_source = "tienda" if target_loc else None
target_loc_match_kind = None
# Segundo check: si TIENDA no resolvio, intentar mapear via Sucursal
# con matching tolerante (abreviaturas + substring).
if not target_loc and sucursal_value:
fb_loc, fb_kind = resolve_location_from_sucursal(sucursal_value, verifier_by_loc)
if fb_loc:
target_loc = fb_loc
target_loc_source = f"sucursal_{fb_kind}"
target_loc_match_kind = fb_kind
# Match global contra todas las sucursales (informativo)
global_matches = find_match(c, branch_idx_phone, branch_idx_email, branch_idx_name)
if global_matches:
brand_present_in_any_branch += 1
else:
brand_not_in_any_branch.append({
**fmt_contact(c),
"tienda": tienda_value,
"sucursal": sucursal_value,
"expected_location_id": target_loc,
"expected_branch_name": branch_data.get(target_loc, {}).get("name") if target_loc else None,
"resolution_source": target_loc_source,
"opps_in_brand": len(brand_opps_by_cid.get(c["id"], [])),
})
if not tienda_value:
# Enriquecer con datos del CF Sucursal para que el dashboard pueda
# ofrecer "Llenar TIENDA desde Sucursal". Solo es resoluble si
# Sucursal mapea a una sucursal del verificador con TIENDA poblada.
sucursal_resolved_loc = None
sucursal_resolved_kind = None
expected_tienda = None
expected_branch_name = None
if sucursal_value:
fb_loc, fb_kind = resolve_location_from_sucursal(
sucursal_value, verifier_by_loc
)
if fb_loc:
sucursal_resolved_loc = fb_loc
sucursal_resolved_kind = fb_kind
vinfo = verifier_by_loc.get(fb_loc) or {}
expected_tienda = vinfo.get("tienda_raw")
expected_branch_name = branch_data.get(fb_loc, {}).get("name")
brand_without_tienda.append({
**fmt_contact(c),
"sucursal": sucursal_value or "",
"sucursal_resolved_location_id": sucursal_resolved_loc,
"sucursal_resolution_kind": sucursal_resolved_kind,
"expected_tienda": expected_tienda,
"expected_branch_name": expected_branch_name,
"looks_like_test": looks_like_test_contact(c),
})
continue
if not target_loc:
brand_with_unknown_tienda.append({
**fmt_contact(c),
"tienda": tienda_value,
})
continue
# Buscar en la sucursal asignada
idx = per_branch_idx.get(target_loc)
if not idx:
# La sucursal asignada esta filtrada por demo o no esta cacheada.
continue
branch_phone_idx, branch_email_idx, branch_name_idx_loc = idx
in_assigned = find_match(c, branch_phone_idx, branch_email_idx, branch_name_idx_loc)
if not in_assigned:
# Si la sucursal asignada es una "shell" absorbida por un hub
# digital (Toluca/Metepec/Lerma -> Pilares), aceptar la presencia
# en el hub como correcta. Evita ~82 falsos positivos.
hub_loc = DIGITAL_HUB_BY_SHELL.get(target_loc)
hub_idx = per_branch_idx.get(hub_loc) if hub_loc else None
if hub_idx and find_match(c, hub_idx[0], hub_idx[1], hub_idx[2]):
in_assigned = True
if in_assigned:
continue # Esta donde corresponde, sin discrepancia.
opps_in_brand = len(brand_opps_by_cid.get(c["id"], []))
other_branches_enriched = _enrich_other_branches(global_matches)
if global_matches:
# Caso B: esta en OTRA sucursal (no la asignada).
present_in_other_branch_not_assigned.append({
**fmt_contact(c),
"tienda": tienda_value,
"expected_location_id": target_loc,
"expected_branch_name": branch_data[target_loc]["name"],
"opps_in_brand": opps_in_brand,
"other_branches": other_branches_enriched,
})
else:
# Caso A o Caso D: el contacto Marca no matcheó con NINGUNA
# sucursal por phone/email. Buscar homónimos exactos por nombre
# en la sucursal asignada para decidir el sub-caso.
brand_full = f"{c.get('first_name') or ''} {c.get('last_name') or ''}"
brand_name_norm = " ".join(strip_accents(brand_full).lower().split())
candidates_by_name = []
if brand_name_norm:
candidates_by_name = per_branch_name_idx.get(target_loc, {}).get(brand_name_norm, []) or []
# Caso D: probable duplicado en Marca. El contacto Marca SÍ tiene
# phone/email, no matcheo en ninguna sucursal, pero hay un homónimo
# con identificadores fuertes en la sucursal asignada. Lo más
# probable: en sucursal está el contacto bueno y en Marca quedó
# un registro extra con otro número/email.
brand_phone_norm = normalize_phone(c.get("phone"))
brand_email_norm = normalize_email(c.get("email"))
brand_has_strong_id = bool(brand_phone_norm or brand_email_norm)
candidate_with_strong_id = None
for cand in candidates_by_name:
if normalize_phone(cand.get("phone")) or normalize_email(cand.get("email")):
candidate_with_strong_id = cand
break
if brand_has_strong_id and candidate_with_strong_id:
probable_duplicate_in_brand.append({
**fmt_contact(c),
"tienda": tienda_value,
"expected_location_id": target_loc,
"expected_branch_name": branch_data[target_loc]["name"],
"opps_in_brand": opps_in_brand,
"branch_existing_contact": {
"id": candidate_with_strong_id["id"],
"phone": candidate_with_strong_id.get("phone"),
"email": candidate_with_strong_id.get("email"),
"first_name": candidate_with_strong_id.get("first_name"),
"last_name": candidate_with_strong_id.get("last_name"),
},
"homonyms_in_branch_count": len(candidates_by_name),
})
else:
# Caso A: contacto genuinamente ausente. Mantener candidato
# por nombre para el botón de update-branch-from-brand
# cuando aplique (caso típico: ambos sin phone/email).
target_candidate = None
if len(candidates_by_name) == 1:
cand = candidates_by_name[0]
target_candidate = {
"id": cand["id"],
"phone": cand.get("phone"),
"email": cand.get("email"),
"first_name": cand.get("first_name"),
"last_name": cand.get("last_name"),
}
# Fuzzy: explora otras sucursales con modalidades permisivas
# (teléfono parcial, email canónico/local, primer-apellido).
# Es puramente informativo — el contacto sigue en este bucket.
fuzzy = find_fuzzy_matches(
c,
branch_fuzzy_indexes,
exclude_ids=None, # aquí global_matches está vacío por definición
strict_match_phone=normalize_phone(c.get("phone")),
strict_match_email=normalize_email(c.get("email")),
)
# Enriquece cada match con la TIENDA del verificador para
# dar contexto al operador (a qué sucursal apunta cada uno).
for fm in fuzzy:
loc_id_fm = fm.get("location_id")
if loc_id_fm:
vinfo = verifier_by_loc.get(loc_id_fm) or {}
fm["location_tienda"] = vinfo.get("tienda_raw")
missing_in_assigned_branch.append({
**fmt_contact(c),
"tienda": tienda_value,
"expected_location_id": target_loc,
"expected_branch_name": branch_data[target_loc]["name"],
"opps_in_brand": opps_in_brand,
"branch_target_candidate": target_candidate,
"branch_target_candidates_count": len(candidates_by_name),
"fuzzy_matches": fuzzy,
})
# ----------------------------------------------------------------------
# 3) Oportunidades sin contraparte
# ----------------------------------------------------------------------
# Para cada opp de sucursal, ver si su contacto tiene match en Marca y si
# ese contact Marca tiene al menos 1 opp. Si no, es opp ausente en Marca.
missing_opps_in_brand = []
for loc, data in branch_data.items():
bc_by_id = {c["id"]: c for c in data["contacts"]}
for o in data["opps"]:
# Criterio PRINCIPAL: match por el campo "ID Oportunidad Sucursal".
# Si existe una opp de Marca cuyo valor de ese campo == el id nativo
# de esta opp de sucursal, está replicada. Compara cada opp de forma
# individual -> detecta el gap multi-empeño. Si no hay match por
# campo, cae al respaldo por contacto (lógica histórica de abajo).
if o.get("id") in brand_opps_by_link:
continue
contact = bc_by_id.get(o.get("contact_id"))
if not contact:
# opp huerfana sin contacto cacheado en sucursal -> reportar pero como anomaly
missing_opps_in_brand.append({
"id": o["id"],
"name": o.get("name") or "",
"status": o.get("status") or "",
"monetary_value": o.get("monetary_value") or 0,
"branch_location_id": loc,
"branch_name": data["name"],
"contact_id": o.get("contact_id") or "",
"contact_name": "(contacto no cacheado)",
"contact_phone": "",
"contact_email": "",
"reason": "contacto_huerfano",
})
continue
marca_matches = find_match(contact, brand_idx_phone, brand_idx_email, brand_idx_name)
if not marca_matches:
missing_opps_in_brand.append({
"id": o["id"],
"name": o.get("name") or "",
"status": o.get("status") or "",
"monetary_value": o.get("monetary_value") or 0,
"branch_location_id": loc,
"branch_name": data["name"],
"contact_id": contact["id"],
"contact_name": fmt_contact(contact)["name"],
"contact_phone": contact.get("phone") or "",
"contact_email": contact.get("email") or "",
"reason": "contacto_no_en_marca",
})
continue
# contacto SI esta en Marca, ver si tiene opps replicadas
has_brand_opp = any(brand_opps_by_cid.get(m["id"]) for m in marca_matches)
if not has_brand_opp:
missing_opps_in_brand.append({
"id": o["id"],
"name": o.get("name") or "",
"status": o.get("status") or "",
"monetary_value": o.get("monetary_value") or 0,
"branch_location_id": loc,
"branch_name": data["name"],
"contact_id": contact["id"],
"contact_name": fmt_contact(contact)["name"],
"contact_phone": contact.get("phone") or "",
"contact_email": contact.get("email") or "",
"reason": "opp_no_replicada",
})
# ----------------------------------------------------------------------
# 3b) Oportunidades con el campo "ID Oportunidad Sucursal" vacío o inválido
# ----------------------------------------------------------------------
# El valor debe ser el id nativo de la opp (20 chars alfanuméricos). Vacío
# o len != 20 => inválido. Sucursales son accionables (botón de llenado =
# su propio id); Marca es informativo (su campo se resuelve por matcheo/sync).
opps_missing_id_field = []
def _classify_link(value):
if not value:
return "vacio"
if not OPP_ID_PATTERN.match(str(value)):
return "longitud_invalida"
return None
for o in brand_opps:
v = extract_opp_link_value(o.get("custom_fields_json"), brand_opp_link_field_id)
reason = _classify_link(v)
if reason is None:
continue
opps_missing_id_field.append({
"id": o["id"],
"name": o.get("name") or "",
"status": o.get("status") or "",
"location_id": BRAND_LOCATION_ID,
"location_name": brand["nombre"],
"is_brand": True,
"field_value": v or "",
"field_len": len(str(v)) if v else 0,
"reason": reason,
})
for loc, data in branch_data.items():
branch_link_fid = resolve_opp_link_field_id(conn, loc)
for o in data["opps"]:
v = extract_opp_link_value(o.get("custom_fields_json"), branch_link_fid)
reason = _classify_link(v)
if reason is None:
continue
opps_missing_id_field.append({
"id": o["id"],
"name": o.get("name") or "",
"status": o.get("status") or "",
"location_id": loc,
"location_name": data["name"],
"is_brand": False,
"field_value": v or "",
"field_len": len(str(v)) if v else 0,
"reason": reason,
})
# ----------------------------------------------------------------------
# 3b-bis) Réplicas DUPLICADAS en Marca (mismo "ID Oportunidad Sucursal")
# ----------------------------------------------------------------------
# Descuadre POSITIVO (Marca > sucursales): si dos o más opps de Marca
# comparten el MISMO valor de "ID Oportunidad Sucursal" (= apuntan a la
# misma opp de sucursal de origen) son réplicas duplicadas. Causa típica:
# el workflow n8n de sync de opps hace CREATE en vez de UPDATE (no encontró
# la opp existente al replicar). Es INVISIBLE para el bucket de huérfanas
# (que trata el link como salvaguarda y nunca verifica unicidad).
#
# Por cada cluster se recomienda conservar la canónica y borrar las
# sobrantes según la jerarquía de resolución de duplicados:
# (1) monetary_value mayor, (2) status activo (won/open) > lost/abandoned,
# (3) más antiguo > reciente [requiere createdAt en vivo -> el limpiador
# lo resuelve], (4) TIENDA. Cuando valor y status empatan se marca
# tie_break_needs_live_createdat=True para que el limpiador desempate.
opps_in_brand_duplicate_link = []
# Índice de opps de sucursal por id nativo -> (location_id, branch_name)
# para nombrar el origen del link de cada cluster.
branch_opp_owner_by_id = {}
for loc, data in branch_data.items():
for o in data["opps"]:
branch_opp_owner_by_id[o["id"]] = (loc, data["name"])
brand_contact_name_by_id = {c["id"]: fmt_contact(c)["name"] for c in brand_contacts}
# Agrupa TODAS las opps de Marca por su valor de link válido (20 chars).
brand_opps_link_groups = defaultdict(list)
for o in brand_opps:
v = extract_opp_link_value(o.get("custom_fields_json"), brand_opp_link_field_id)
if v and OPP_ID_PATTERN.match(str(v)):
brand_opps_link_groups[v].append(o)
_STATUS_RANK = {"won": 3, "open": 2, "lost": 1, "abandoned": 0}
duplicate_link_group_count = 0
duplicate_link_extra = 0 # opps sobrantes = sum(group_size - 1)
for link_value, group in brand_opps_link_groups.items():
if len(group) < 2:
continue
duplicate_link_group_count += 1
duplicate_link_extra += len(group) - 1
owner_loc, owner_name = branch_opp_owner_by_id.get(link_value, (None, None))
def _rank(o):
return (
float(o.get("monetary_value") or 0),
_STATUS_RANK.get((o.get("status") or "").lower(), 0),
)
ordered = sorted(group, key=_rank, reverse=True)
top, second = ordered[0], ordered[1]
tie = _rank(top) == _rank(second)
for idx, o in enumerate(ordered):
opps_in_brand_duplicate_link.append({
"id": o["id"],
"name": o.get("name") or "",
"status": o.get("status") or "",
"monetary_value": o.get("monetary_value") or 0,
"contact_id": o.get("contact_id") or "",
"contact_name": brand_contact_name_by_id.get(o.get("contact_id"), ""),
"link_value": link_value,
"branch_opp_id": link_value,
"branch_location_id": owner_loc or "",
"branch_name": owner_name or "(sucursal no cacheada)",
"group_size": len(group),
"recommended_action": "keep" if idx == 0 else "delete",
"tie_break_needs_live_createdat": tie,
})
# ----------------------------------------------------------------------
# 3c) Contactos con el campo "ID Contacto Sucursal" vacío o inválido
# ----------------------------------------------------------------------
# Paralelo al bucket de opps. Sucursales son accionables (botón llenado =
# contact.id propio); Marca es informativo (su campo se resuelve por
# matcheo/sync workflow, no se llena manualmente).
contacts_missing_id_field = []
for c in brand_contacts:
v = extract_contact_link_value(c.get("custom_fields_json"), brand_contact_link_field_id)
reason = _classify_link(v)
if reason is None:
continue
contacts_missing_id_field.append({
"id": c["id"],
"first_name": c.get("first_name") or "",
"last_name": c.get("last_name") or "",
"phone": c.get("phone") or "",
"email": c.get("email") or "",
"location_id": BRAND_LOCATION_ID,
"location_name": brand["nombre"],
"is_brand": True,
"field_value": v or "",
"field_len": len(str(v)) if v else 0,
"reason": reason,
})
for loc, data in branch_data.items():
branch_contact_link_fid = resolve_contact_link_field_id(conn, loc)
for c in data["contacts"]:
v = extract_contact_link_value(c.get("custom_fields_json"), branch_contact_link_fid)
reason = _classify_link(v)
if reason is None:
continue
contacts_missing_id_field.append({
"id": c["id"],
"first_name": c.get("first_name") or "",
"last_name": c.get("last_name") or "",
"phone": c.get("phone") or "",
"email": c.get("email") or "",
"location_id": loc,
"location_name": data["name"],
"is_brand": False,
"field_value": v or "",
"field_len": len(str(v)) if v else 0,
"reason": reason,
})
# ----------------------------------------------------------------------
# 4) Desglose por sucursal
# ----------------------------------------------------------------------
for loc, data in branch_data.items():
per_branch_summary.append({
"location_id": loc,
"name": data["name"],
"contacts": len(data["contacts"]),
"opportunities": len(data["opps"]),
})
per_branch_summary.sort(key=lambda x: x["name"])
# ----------------------------------------------------------------------
# 5) Duplicados intra-Marca: mismo nombre normalizado, sin phone NI email
# ----------------------------------------------------------------------
intra_brand_duplicates = []
name_groups = defaultdict(list)
for c in brand_contacts:
if normalize_phone(c.get("phone")) or normalize_email(c.get("email")):
continue
full_name = f"{c.get('first_name') or ''} {c.get('last_name') or ''}"
n = " ".join(strip_accents(full_name).lower().split())
if not n:
continue
name_groups[n].append(c)
# Pre-indexar contactos sucursal por name_norm (solo los que no tienen
# phone ni email), para encontrar candidatos de sync para "unico restante".
branch_no_pe_by_name = defaultdict(list)
for loc, data in branch_data.items():
from collections import Counter as _Counter
opps_by_cid = _Counter(o.get("contact_id") for o in data["opps"] if o.get("contact_id"))
for c in data["contacts"]:
if normalize_phone(c.get("phone")) or normalize_email(c.get("email")):
continue
full = f"{c.get('first_name') or ''} {c.get('last_name') or ''}"
nm = " ".join(strip_accents(full).lower().split())
if not nm:
continue
branch_no_pe_by_name[nm].append({
"id": c["id"],
"location_id": loc,
"branch_name": data["name"],
"opps_count": opps_by_cid.get(c["id"], 0),
})
group_count = 0
for name_norm, ccs in name_groups.items():
if len(ccs) < 2:
continue
group_count += 1
candidates = branch_no_pe_by_name.get(name_norm, [])
# Ordenar por fecha_added desc, ponemos primero los recientes
sorted_ccs = sorted(ccs, key=lambda x: (x.get("date_added") or ""), reverse=True)
for c in sorted_ccs:
intra_brand_duplicates.append({
**fmt_contact(c),
"name_norm": name_norm,
"group_size": len(ccs),
"opps_in_brand": len(brand_opps_by_cid.get(c["id"], [])),
"date_added": c.get("date_added") or "",
"branch_candidates": candidates,
})
# ----------------------------------------------------------------------
# Resumen
# ----------------------------------------------------------------------
contact_diff = len(brand_contacts) - total_branch_contacts
opp_diff = len(brand_opps) - total_branch_opps
def maybe_limit(lst):
if limit_missing is None or len(lst) <= limit_missing:
return lst, len(lst), False
return lst[:limit_missing], len(lst), True
missing_in_brand_lim, mib_total, mib_trunc = maybe_limit(missing_in_brand)
missing_in_assigned_lim, mia_total, mia_trunc = maybe_limit(missing_in_assigned_branch)
present_other_lim, pother_total, pother_trunc = maybe_limit(present_in_other_branch_not_assigned)
probable_dup_lim, probable_dup_total, probable_dup_trunc = maybe_limit(probable_duplicate_in_brand)
brand_without_tienda_lim, bwt_total, bwt_trunc = maybe_limit(brand_without_tienda)
brand_unknown_tienda_lim, but_total, but_trunc = maybe_limit(brand_with_unknown_tienda)
brand_not_any_lim, bna_total, bna_trunc = maybe_limit(brand_not_in_any_branch)
missing_opps_lim, mo_total, mo_trunc = maybe_limit(missing_opps_in_brand)
opps_missing_id_lim, omif_total, omif_trunc = maybe_limit(opps_missing_id_field)
dup_link_lim, dup_link_total, dup_link_trunc = maybe_limit(opps_in_brand_duplicate_link)
contacts_missing_id_lim, cmif_total, cmif_trunc = maybe_limit(contacts_missing_id_field)
dup_lim, dup_total, dup_trunc = maybe_limit(intra_brand_duplicates)
return {
"totals": {
"brand": {
"name": brand["nombre"],
"location_id": BRAND_LOCATION_ID,
"contacts": len(brand_contacts),
"opportunities": len(brand_opps),
},
"branches_aggregate": {
"branch_count": len(branches),
"contacts": total_branch_contacts,
"opportunities": total_branch_opps,
},
"diff": {
"contacts": contact_diff,
"opportunities": opp_diff,
"contacts_match": contact_diff == 0,
"opportunities_match": opp_diff == 0,
},
},
"demos_excluded": [
{"location_id": d["location_id"], "name": d["nombre"]} for d in demos
],
"per_branch": per_branch_summary,
"missing": {
"contacts_in_branch_not_in_brand": {
"total": mib_total,
"items": missing_in_brand_lim,
"truncated": mib_trunc,
},
"contacts_in_brand_not_in_assigned_branch": {
"total": mia_total,
"items": missing_in_assigned_lim,
"truncated": mia_trunc,
},
"contacts_in_brand_present_in_other_branch_not_assigned": {
"total": pother_total,
"items": present_other_lim,
"truncated": pother_trunc,
},
"contacts_in_brand_probable_duplicate": {
"total": probable_dup_total,
"items": probable_dup_lim,
"truncated": probable_dup_trunc,
},
"contacts_in_brand_without_tienda": {
"total": bwt_total,
"items": brand_without_tienda_lim,
"truncated": bwt_trunc,
},
"contacts_in_brand_with_unknown_tienda": {
"total": but_total,
"items": brand_unknown_tienda_lim,
"truncated": but_trunc,
},
"contacts_in_brand_not_in_any_branch": {
"total": bna_total,
"items": brand_not_any_lim,
"truncated": bna_trunc,
},
"opportunities_in_branch_not_in_brand": {
"total": mo_total,
"items": missing_opps_lim,
"truncated": mo_trunc,
},
"opportunities_missing_id_field": {
"total": omif_total,
"items": opps_missing_id_lim,
"truncated": omif_trunc,
},
"opportunities_in_brand_duplicate_link": {
"total": dup_link_total,
"items": dup_link_lim,
"truncated": dup_link_trunc,
"group_count": duplicate_link_group_count,
"extra_opps": duplicate_link_extra,
},
"contacts_missing_id_field": {
"total": cmif_total,
"items": contacts_missing_id_lim,
"truncated": cmif_trunc,
},
"intra_brand_duplicates": {
"total": dup_total,
"items": dup_lim,
"truncated": dup_trunc,
"group_count": group_count,
},
},
"meta": {
"brand_tienda_field_id": brand_tienda_field_id,
"verifier_loaded": bool(verifier_by_loc),
"verifier_entries": len(verifier_by_loc),
"brand_present_in_any_branch": brand_present_in_any_branch,
},
}
finally:
conn.close()
# ---------------------------------------------------------------------------
# CLI / impresion
# ---------------------------------------------------------------------------
def print_report(data, show_missing=False, missing_cap=20):
t = data["totals"]
b = t["brand"]
a = t["branches_aggregate"]
d = t["diff"]
safe_print("=" * 72)
safe_print("COMPARATIVA MARCA vs SUCURSALES (excluye cuentas demo)")
safe_print("=" * 72)
safe_print(f" Marca: {b['name']} ({b['location_id']})")
safe_print(f" Contactos : {b['contacts']:>8}")
safe_print(f" Oportunidades : {b['opportunities']:>8}")
safe_print(f" Sucursales activas: {a['branch_count']}")
safe_print(f" Contactos suma : {a['contacts']:>8}")
safe_print(f" Oportunidades : {a['opportunities']:>8}")
safe_print("-" * 72)
status_c = "OK (iguales)" if d["contacts_match"] else f"DESCUADRE: {d['contacts']:+}"
status_o = "OK (iguales)" if d["opportunities_match"] else f"DESCUADRE: {d['opportunities']:+}"
safe_print(f" Diff contactos : {status_c}")
safe_print(f" Diff oportunidades : {status_o}")
demos = data["demos_excluded"]
if demos:
safe_print("-" * 72)
safe_print(f" Cuentas demo excluidas ({len(demos)}):")
for d_acc in demos:
safe_print(f" - {d_acc['name']} ({d_acc['location_id']})")
safe_print("=" * 72)
safe_print("Desglose por sucursal:")
safe_print(f" {'Sucursal':<45} {'Cont.':>8} {'Opps':>8}")
safe_print(" " + "-" * 64)
for row in data["per_branch"]:
name = row["name"][:44]
safe_print(f" {name:<45} {row['contacts']:>8} {row['opportunities']:>8}")
m = data["missing"]
safe_print("=" * 72)
safe_print("Resumen de huecos detectados:")
safe_print(f" Contactos en sucursal sin contraparte en Marca : {m['contacts_in_branch_not_in_brand']['total']}")
safe_print(f" Contactos en Marca sin presencia en la sucursal asignada : {m['contacts_in_brand_not_in_assigned_branch']['total']}")
safe_print(f" Probables duplicados en Marca (homonimo en sucursal) : {m['contacts_in_brand_probable_duplicate']['total']}")
safe_print(f" Contactos en Marca sin TIENDA poblada : {m['contacts_in_brand_without_tienda']['total']}")
safe_print(f" Contactos en Marca con TIENDA desconocida en verificador : {m['contacts_in_brand_with_unknown_tienda']['total']}")
safe_print(f" Contactos en Marca sin contraparte en NINGUNA sucursal : {m['contacts_in_brand_not_in_any_branch']['total']}")
safe_print(f" Oportunidades en sucursal sin replica en Marca : {m['opportunities_in_branch_not_in_brand']['total']}")
_dup = m.get("opportunities_in_brand_duplicate_link", {})
safe_print(f" Replicas DUPLICADAS en Marca (mismo ID Opp Sucursal) : {_dup.get('total', 0)} en {_dup.get('group_count', 0)} grupos ({_dup.get('extra_opps', 0)} sobrantes)")
if show_missing:
def dump(title, key, formatter):
block = m[key]
if not block["total"]:
return
safe_print("-" * 72)
safe_print(f" {title} ({block['total']} total, mostrando hasta {missing_cap}):")
for item in block["items"][:missing_cap]:
safe_print(" - " + formatter(item))
dump(
"Contactos en sucursal no presentes en Marca",
"contacts_in_branch_not_in_brand",
lambda i: f"{i['name']} | {i['phone'] or i['email'] or '(sin contacto)'} | sucursal: {i['branch_name']} | opps locales: {i['opps_in_branch']}",
)
dump(
"Contactos en Marca ausentes de su sucursal asignada",
"contacts_in_brand_not_in_assigned_branch",
lambda i: f"{i['name']} | tienda='{i.get('tienda')}' | esperado: {i['expected_branch_name']} | opps en marca: {i['opps_in_brand']} | esta en otra sucursal: {i['present_in_other_branch']}",
)
dump(
"Probables duplicados en Marca (existe homonimo con tel/email en sucursal asignada)",
"contacts_in_brand_probable_duplicate",
lambda i: f"{i['name']} | tienda='{i.get('tienda')}' | esperado: {i['expected_branch_name']} | brand_phone={i.get('phone')!r} | branch_phone={(i.get('branch_existing_contact') or {}).get('phone')!r}",
)
dump(
"Oportunidades en sucursal sin replica en Marca",
"opportunities_in_branch_not_in_brand",
lambda i: f"{i['name']} [{i['status']}] | ${i['monetary_value']:.0f} | contacto: {i['contact_name']} | sucursal: {i['branch_name']} | motivo: {i['reason']}",
)
dump(
"Replicas DUPLICADAS en Marca (mismo ID Oportunidad Sucursal)",
"opportunities_in_brand_duplicate_link",
lambda i: f"{i['recommended_action'].upper():>6} | {i['name']} [{i['status']}] | ${i['monetary_value']:.0f} | opp={i['id']} | link={i['link_value']} | origen: {i['branch_name']} | grupo de {i['group_size']}",
)
def main():
    """Command-line entry point: parse flags, run the audit and print it.

    Prints the result as JSON when ``--json`` is given, otherwise as the
    text report. If ``run_audit`` raises ``FileNotFoundError`` the process
    exits with status 2; if it raises ``RuntimeError`` it exits with status 3.
    In both cases the error message is printed first.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    # Flags are registered in this order so the --help listing keeps its layout.
    flag_specs = (
        ("--show-missing", {"action": "store_true", "help": "Imprime los listados de ausentes."}),
        ("--limit-missing", {"type": int, "default": None, "help": "Limita el listado interno antes de imprimir (default sin limite)."}),
        ("--missing-cap", {"type": int, "default": 20, "help": "Cuantos items imprimir por listado cuando --show-missing."}),
        ("--json", {"action": "store_true", "help": "Imprime el resultado como JSON en vez del reporte humano."}),
    )
    for flag, spec in flag_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    try:
        data = run_audit(limit_missing=opts.limit_missing)
    except (FileNotFoundError, RuntimeError) as e:
        safe_print(f"ERROR: {e}")
        # Distinct exit codes let callers tell the two failure kinds apart.
        sys.exit(2 if isinstance(e, FileNotFoundError) else 3)

    if opts.json:
        safe_print(json.dumps(data, ensure_ascii=False, indent=2))
    else:
        print_report(data, show_missing=opts.show_missing, missing_cap=opts.missing_cap)
# Run the CLI only when this file is executed as a script; importing the
# module (e.g. to reuse run_audit()) must not trigger an audit run.
if __name__ == "__main__":
    main()