#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""audit_brand_sucursal_vs_form.py

Read-only. Compares the 'Sucursal' (branch) value that the contact originally
submitted in the form (Marca account) against the current value of the
'Sucursal' field on the Marca contact. Fuzzy similarity is used to tolerate
state abbreviations from old versions of the form
(e.g. 'QRO' vs 'Queretaro, Queretaro').

Requirements:
  1. The global sync (Sincronizar Todo) has been run, so contacts exist.
  2. `python scripts/sync_forms_brand.py` has been run, so form_submissions
     exist in SQLite.

Buckets (defaults):
  - OK            (similarity >= 0.60): the form and Marca point to the same
                  branch or to a reasonable abbreviation of it.
  - VERIFICAR     (0.30 <= similarity < 0.60): similar but not identical
                  names; worth a manual review.
  - DISCREPANCIA  (similarity < 0.30): the contact should be in another
                  branch according to what the customer typed in the form.

Special cases:
  - Contact without a Sucursal field in Marca: bucket CONTACTO_SIN_SUCURSAL.
  - Submission with an empty sucursal_value: ignored (provides no evidence).
  - Multiple submissions per contact: the LATEST one (most recent createdAt)
    is used because it reflects the customer's most current intent.

Usage:
  python scripts/audit_brand_sucursal_vs_form.py
  python scripts/audit_brand_sucursal_vs_form.py --filter-marca queretaro
  python scripts/audit_brand_sucursal_vs_form.py --show all
  python scripts/audit_brand_sucursal_vs_form.py --xlsx
"""
import argparse
import json
import os
import re
import sys
import unicodedata
from collections import Counter, defaultdict
from datetime import datetime
from difflib import SequenceMatcher

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)

import db  # noqa: E402
from common import REPORT_DRIFT  # noqa: E402

BRAND_LOCATION_ID = "GbKkBpCmKu2QmloKFHy3"
EXPORTS_DIR = REPORT_DRIFT  # local name kept for compatibility with the rest of the script


# --- Normalization helpers ----------------------------------------------------

# Mexican states with common abbreviations that show up in old forms.
STATE_ABBR = {
    "ags": "aguascalientes",
    "bc": "baja california",
    "bcs": "baja california sur",
    "camp": "campeche",
    "chis": "chiapas",
    "chih": "chihuahua",
    "coah": "coahuila",
    "col": "colima",
    "cdmx": "ciudad de mexico",
    "df": "ciudad de mexico",
    "dgo": "durango",
    "edomex": "estado de mexico",
    "mex": "estado de mexico",
    "gto": "guanajuato",
    "gro": "guerrero",
    "hgo": "hidalgo",
    "jal": "jalisco",
    "mich": "michoacan",
    "mor": "morelos",
    "nay": "nayarit",
    "nl": "nuevo leon",
    "oax": "oaxaca",
    "pue": "puebla",
    "qro": "queretaro",
    "qroo": "quintana roo",
    "slp": "san luis potosi",
    "sin": "sinaloa",
    "son": "sonora",
    "tab": "tabasco",
    "tamps": "tamaulipas",
    "tlax": "tlaxcala",
    "ver": "veracruz",
    "yuc": "yucatan",
    "zac": "zacatecas",
}


def strip_accents(text):
    """Return ``text`` as a string with combining marks (accents) removed.

    Falsy input (None, "", 0) yields the empty string.
    """
    if not text:
        return ""
    decomposed = unicodedata.normalize("NFKD", str(text))
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def normalize_sucursal(value):
    """Normalize a branch string into a canonical, comparable form.

    Steps:
      - lowercase, strip accents, replace punctuation with spaces
      - expand known state abbreviations (see STATE_ABBR)
      - drop stopwords
      - drop duplicate tokens and sort the remainder

    Returns the unique, sorted tokens joined by single spaces, or "" when
    ``value`` is None or nothing survives the cleanup.
    """
    if value is None:
        return ""
    text = strip_accents(str(value)).lower()
    text = re.sub(r"[^\w\s]", " ", text)

    expanded = []
    for tok in text.split():
        if tok in STATE_ABBR:
            expanded.extend(STATE_ABBR[tok].split())
        else:
            expanded.append(tok)

    stop = {"de", "la", "el", "los", "las", "mp", "monte", "providencia"}
    # Deduplicate and sort so that 'Queretaro, Queretaro' and 'QRO' (and the
    # same words in a different order) normalize to the same string.
    unique = {t for t in expanded if t and t not in stop}
    return " ".join(sorted(unique))


def similarity(a, b):
    """Return a 0-1 similarity ratio between two branch strings.

    Token overlap (Jaccard) is prioritized over character-level matching,
    because two completely different branches (Puebla vs Queretaro) can share
    letters but no token.

    Rules:
      - Either side normalizes to empty       -> 0.0
      - Same normalized string                -> 1.0
      - Jaccard of tokens == 0 (disjoint)     -> at most 0.25 (forces DISCREPANCIA)
      - Tokens overlap                        -> 50/50 average of Jaccard and
                                                 SequenceMatcher ratio
    """
    na, nb = normalize_sucursal(a), normalize_sucursal(b)
    if not na or not nb:
        return 0.0
    if na == nb:
        return 1.0
    ta, tb = set(na.split()), set(nb.split())
    if not ta or not tb:
        return 0.0
    jaccard = len(ta & tb) / len(ta | tb)
    seq = SequenceMatcher(None, na, nb).ratio()
    if jaccard == 0:
        # No token in common -> clearly different branches. Cap at 0.25 so it
        # lands in DISCREPANCIA, keeping a bit of seq to tell near-typos apart
        # from gross mismatches.
        return min(0.25, seq * 0.3)
    return 0.5 * jaccard + 0.5 * seq


# --- Contact origin classification --------------------------------------------
# Rules (ordered from most to least reliable):
#   1. Has a submission in form_submissions + tag 'formulario' -> FORMULARIO (hard evidence)
#   2. Has tag 'formulario' (no recorded submission)           -> FORMULARIO_SIN_RASTRO
#   3. Has tag 'sucursal' (without tag 'formulario')           -> SUCURSAL
#   4. Native source contains 'web user'                       -> SUCURSAL (confirms branch)
#   5. Source contains facebook/instagram/ads/google/lead      -> DIGITAL
#   6. Source contains 'integration'                           -> INTEGRATION (likely a readjustment)
#   7. No clear signals                                        -> DESCONOCIDO
DIGITAL_KEYWORDS = ("facebook", "instagram", "google", "tiktok", "youtube",
                    "ads", "lead", "landing", "messenger", "linkedin")


def classify_origin(contact_tags, contact_source, has_form_submission):
    """Classify where a contact most likely came from.

    Args:
        contact_tags: iterable of tags (or None); compared case-insensitively.
        contact_source: native source string (or None); matched by substring.
        has_form_submission: truthy when a form submission is recorded.

    Returns:
        One of "FORMULARIO", "FORMULARIO_SIN_RASTRO", "SUCURSAL", "DIGITAL",
        "INTEGRATION" or "DESCONOCIDO". Tags are checked before the source.
    """
    tags_lower = {str(t).strip().lower() for t in (contact_tags or []) if t}
    source_lower = str(contact_source or "").strip().lower()

    if "formulario" in tags_lower:
        return "FORMULARIO" if has_form_submission else "FORMULARIO_SIN_RASTRO"
    if "sucursal" in tags_lower:
        return "SUCURSAL"
    if "web user" in source_lower or "webuser" in source_lower:
        return "SUCURSAL"
    if "sucursal" in source_lower:
        return "SUCURSAL"
    if "form" in source_lower:
        # source = "Formulario" / "Formulario - Sitio Web" without the 'formulario' tag
        return "FORMULARIO" if has_form_submission else "FORMULARIO_SIN_RASTRO"
    if any(k in source_lower for k in DIGITAL_KEYWORDS):
        return "DIGITAL"
    if "integration" in source_lower:
        return "INTEGRATION"
    return "DESCONOCIDO"


def origin_confidence(origin):
    """Return how much weight the origin carries when judging Sucursal.

    Returns "alta", "media" or "baja"; unknown origins map to "baja".
    """
    return {
        "FORMULARIO": "alta",  # the customer picked the branch themselves
        "FORMULARIO_SIN_RASTRO": "media",
        "SUCURSAL": "alta",  # created manually in that branch
        "DIGITAL": "media",
        "INTEGRATION": "baja",  # likely a historical readjustment
        "DESCONOCIDO": "baja",
    }.get(origin, "baja")


def parse_tags(tags_value):
    """Return tags as a list; tags are stored as a JSON string in SQLite.

    A list is returned as-is. Anything that is empty, not valid JSON, or does
    not decode to a list yields [].
    """
    if not tags_value:
        return []
    if isinstance(tags_value, list):
        return tags_value
    try:
        parsed = json.loads(tags_value)
    except (TypeError, ValueError):
        # TypeError: not str/bytes; ValueError: malformed JSON (JSONDecodeError).
        return []
    return parsed if isinstance(parsed, list) else []


# --- Extracting Sucursal from the Marca contact -------------------------------

SUCURSAL_CANONICAL = "sucursal"


def extract_custom_value(field):
    """Return the first non-empty value stored in a custom-field dict.

    Keys are tried in a fixed order; None, blank strings and empty
    lists/dicts are skipped. Returns None when no key holds a value.
    """
    for k in ("value", "fieldValueString", "fieldValueDate", "fieldValueNumber",
              "fieldValueArray", "fieldValueOptions", "fieldValueFile"):
        v = field.get(k)
        if v is None:
            continue
        if isinstance(v, str) and not v.strip():
            continue
        if isinstance(v, (list, dict)) and not v:
            continue
        return v
    return None


def resolve_sucursal_field_id_from_db(location_id):
    """Look up the id of the 'Sucursal' contact field for ``location_id``.

    Reads the object_schemas table (populated by the sync) to map the field
    name to its id without calling GHL again. Returns None if no field's
    accent-stripped, lowercased name equals SUCURSAL_CANONICAL.
    """
    conn = db.get_db_connection()
    try:
        rows = conn.execute(
            "SELECT field_id, field_name FROM object_schemas WHERE location_id=? AND object_key='contact'",
            (location_id,)
        ).fetchall()
        for r in rows:
            if strip_accents(r["field_name"]).lower().strip() == SUCURSAL_CANONICAL:
                return r["field_id"]
        return None
    finally:
        conn.close()


def get_contact_sucursal(contact_row, sucursal_field_id):
    """Return the contact's Sucursal value as a string, or None.

    ``contact_row`` must expose ``.get("custom_fields_json")`` holding a JSON
    list of custom-field dicts identified by "id" or "fieldId". List values
    are joined with ", ". Returns None when the field id is falsy, the JSON
    is invalid or not a list, or the field is missing/empty.
    """
    if not sucursal_field_id:
        return None
    raw = contact_row.get("custom_fields_json") or "[]"
    try:
        cfs = json.loads(raw)
    except (TypeError, ValueError):
        return None
    if not isinstance(cfs, list):
        return None
    for f in cfs:
        if not isinstance(f, dict):
            continue
        fid = f.get("id") or f.get("fieldId")
        if fid == sucursal_field_id:
            v = extract_custom_value(f)
            if isinstance(v, list):
                return ", ".join(str(x) for x in v if x is not None) or None
            if v is not None:
                return str(v).strip() or None
    return None


# --- Buckets and report -------------------------------------------------------

def categorize(sim, ok_threshold, verify_threshold):
    """Map a similarity score to "OK", "VERIFICAR" or "DISCREPANCIA"."""
    if sim >= ok_threshold:
        return "OK"
    if sim >= verify_threshold:
        return "VERIFICAR"
    return "DISCREPANCIA"


def latest_submission_per_contact(submissions):
    """Keep the most recent submission per contact_id that carries a branch.

    Submissions without a contact_id, or whose sucursal_value is empty or
    whitespace-only, are ignored. "Most recent" is decided by comparing the
    "created_at" values; on ties the first one seen is kept.

    Returns a dict mapping contact_id -> submission.
    """
    by_contact = {}
    for s in submissions:
        cid = s.get("contact_id")
        if not cid:
            continue
        value = s.get("sucursal_value")
        # A blank value provides no evidence and must not shadow an older,
        # real answer from the same contact.
        if not value or (isinstance(value, str) and not value.strip()):
            continue
        prev = by_contact.get(cid)
        if prev is None or (s.get("created_at") or "") > (prev.get("created_at") or ""):
            by_contact[cid] = s
    return by_contact


def main():
    # CLI entry point; the remaining statements of this function follow.
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
parser.add_argument("--location", default=BRAND_LOCATION_ID, help=f"Location a auditar. Default: Marca ({BRAND_LOCATION_ID})") parser.add_argument("--ok-threshold", type=float, default=0.60, help="Similitud minima para considerar OK. Default 0.60") parser.add_argument("--verify-threshold", type=float, default=0.30, help="Similitud minima para 'verificar'. bucket -> count def build_row(contact, cid, sub): marca_val = get_contact_sucursal(contact, sucursal_fid) tags = parse_tags(contact.get("tags")) source = contact.get("source") has_sub = cid in contact_ids_with_submission origin = classify_origin(tags, source, has_sub) return { "contact_id": cid, "name": (contact.get("first_name") or "") + " " + (contact.get("last_name") or ""), "email": contact.get("email"), "phone": contact.get("phone"), "tags": tags, "source": source, "origin": origin, "origin_confidence": origin_confidence(origin), "marca_sucursal": marca_val, "form_sucursal": (sub.get("sucursal_value") if sub else None) or None, "similarity": None, "bucket": None, "submission_at": (sub.get("created_at") if sub else None), "submission_id": (sub.get("id") if sub else None), } # 4a. 
Contactos CON submission: comparamos form vs marca contacts_evaluated_via_form = set() for cid, sub in latest_by_contact.items(): contact = contacts_by_id.get(cid) form_val = sub.get("sucursal_value") or "" if not contact: buckets["CONTACTO_NO_EN_DB"].append({ "contact_id": cid, "name": sub.get("name"), "email": sub.get("email"), "phone": sub.get("phone"), "form_sucursal": form_val, "submission_at": sub.get("created_at"), "origin": "FORMULARIO", "origin_confidence": "alta", }) continue row_data = build_row(contact, cid, sub) if filter_norm and (not row_data["marca_sucursal"] or filter_norm not in strip_accents(row_data["marca_sucursal"]).lower()): continue if not row_data["marca_sucursal"]: row_data["bucket"] = "CONTACTO_SIN_SUCURSAL" buckets["CONTACTO_SIN_SUCURSAL"].append(row_data) origin_x_bucket[row_data["origin"]]["CONTACTO_SIN_SUCURSAL"] += 1 origin_counter[row_data["origin"]] += 1 contacts_evaluated_via_form.add(cid) continue sim = similarity(form_val, row_data["marca_sucursal"]) bucket = categorize(sim, args.ok_threshold, args.verify_threshold) row_data["similarity"] = round(sim, 3) row_data["bucket"] = bucket buckets[bucket].append(row_data) by_marca_sucursal[strip_accents(row_data["marca_sucursal"]).lower()][bucket] += 1 origin_x_bucket[row_data["origin"]][bucket] += 1 origin_counter[row_data["origin"]] += 1 contacts_evaluated_via_form.add(cid) # 4b. Contactos SIN submission: solo clasificamos por origen (no hay form # para comparar contra marca). Util para entender el universo total. for cid, contact in contacts_by_id.items(): if cid in contacts_evaluated_via_form: continue row_data = build_row(contact, cid, None) if filter_norm and (not row_data["marca_sucursal"] or filter_norm not in strip_accents(row_data["marca_sucursal"]).lower()): continue row_data["bucket"] = "SIN_SUBMISSION" buckets["SIN_SUBMISSION"].append(row_data) origin_x_bucket[row_data["origin"]]["SIN_SUBMISSION"] += 1 origin_counter[row_data["origin"]] += 1 # 5. 
resumen total_with_form = sum(len(buckets[k]) for k in ("OK", "VERIFICAR", "DISCREPANCIA", "CONTACTO_SIN_SUCURSAL")) total_overall = total_with_form + len(buckets["SIN_SUBMISSION"]) + len(buckets["CONTACTO_NO_EN_DB"]) print("\n" + "=" * 72) print("RESUMEN (con submission del formulario = evidencia dura)") print("=" * 72) print(f"Contactos con submission y evaluados: {total_with_form}") print(f" OK (>= {args.ok_threshold:.2f}): {len(buckets['OK'])}") print(f" VERIFICAR ({args.verify_threshold:.2f}-{args.ok_threshold:.2f}): {len(buckets['VERIFICAR'])}") print(f" DISCREPANCIA (< {args.verify_threshold:.2f}): {len(buckets['DISCREPANCIA'])}") print(f" Contacto sin Sucursal en Marca: {len(buckets['CONTACTO_SIN_SUCURSAL'])}") print(f"\nContactos SIN submission (clasificados solo por origen): {len(buckets['SIN_SUBMISSION'])}") print(f"Submission sin contacto en DB: {len(buckets['CONTACTO_NO_EN_DB'])}") print(f"\nTotal universo: {total_overall}") # Desglose por origen probable if origin_counter: print("\n" + "-" * 72) print("ORIGEN PROBABLE (tags + source nativo + submission)") print("-" * 72) for origin, n in origin_counter.most_common(): conf = origin_confidence(origin) breakdown = origin_x_bucket[origin] parts = [f"{b}={c}" for b, c in breakdown.most_common()] print(f" {origin:24s} ({conf:5s} confianza): {n:4d} [{', '.join(parts)}]") if by_marca_sucursal: print("\n" + "-" * 72) print("DESGLOSE POR SUCURSAL EN MARCA (solo contactos con submission)") print("-" * 72) for marca_suc, cnt in sorted(by_marca_sucursal.items(), key=lambda x: -sum(x[1].values())): total_suc = sum(cnt.values()) print(f" {marca_suc!r}: {total_suc} " f"(OK={cnt.get('OK',0)}, VERIF={cnt.get('VERIFICAR',0)}, DISCREP={cnt.get('DISCREPANCIA',0)})") # 6. 
detalle sections = [] if args.show == "all": sections = ["DISCREPANCIA", "VERIFICAR", "OK", "CONTACTO_SIN_SUCURSAL", "SIN_SUBMISSION", "CONTACTO_NO_EN_DB"] elif args.show == "discrepancia": sections = ["DISCREPANCIA"] elif args.show == "verificar": sections = ["DISCREPANCIA", "VERIFICAR"] origin_filter = set(o.strip().upper() for o in (args.origin or "").split(",") if o.strip()) for sec in sections: rows = buckets[sec] if origin_filter: rows = [r for r in rows if (r.get("origin") or "").upper() in origin_filter] if not rows: continue print("\n" + "-" * 72) print(f"{sec} ({len(rows)})") print("-" * 72) for r in rows: name = (r.get("name") or "").strip() or "(sin nombre)" sim_str = f"sim={r['similarity']:.2f}" if r.get("similarity") is not None else "sim=N/A" origin = r.get("origin", "?") conf = r.get("origin_confidence", "?") print(f" {name} [{sim_str}] origen={origin} ({conf})") print(f" contact_id: {r['contact_id']} phone={r.get('phone')!r} email={r.get('email')!r}") print(f" tags={r.get('tags')!r} source={r.get('source')!r}") print(f" Marca dice: {r.get('marca_sucursal')!r}") if r.get("form_sucursal"): print(f" Form dijo: {r['form_sucursal']!r} (submission_at={r.get('submission_at')})") else: print(f" Form: (sin submission registrado)") # 7. 
xlsx opcional if args.xlsx_path is not None: try: from openpyxl import Workbook except ImportError: print("\nWARN: openpyxl no esta instalado; salteo --xlsx") else: os.makedirs(EXPORTS_DIR, exist_ok=True) if args.xlsx_path: path = args.xlsx_path if not os.path.isabs(path) and not os.sep in path: base, ext = os.path.splitext(path) if not ext: ext = ".xlsx" path = os.path.join(EXPORTS_DIR, base + ext) else: ts = datetime.now().strftime("%Y%m%d_%H%M%S") path = os.path.join(EXPORTS_DIR, f"sucursal_vs_form_{ts}.xlsx") wb = Workbook() wb.remove(wb.active) header = ["bucket", "similarity", "origin", "origin_confidence", "contact_id", "name", "email", "phone", "tags", "source", "marca_sucursal", "form_sucursal", "submission_at", "submission_id"] for sec in ["DISCREPANCIA", "VERIFICAR", "OK", "CONTACTO_SIN_SUCURSAL", "SIN_SUBMISSION", "CONTACTO_NO_EN_DB"]: ws = wb.create_sheet(sec[:31]) ws.append(header) for r in buckets[sec]: ws.append([ r.get("bucket"), r.get("similarity"), r.get("origin"), r.get("origin_confidence"), r.get("contact_id"), r.get("name"), r.get("email"), r.get("phone"), ",".join(r.get("tags") or []) if isinstance(r.get("tags"), list) else r.get("tags"), r.get("source"), r.get("marca_sucursal"), r.get("form_sucursal"), r.get("submission_at"), r.get("submission_id"), ]) wb.save(path) print(f"\nExcel exportado: {path}") if __name__ == "__main__": main()