Files
MP-Manager/scripts/audit_brand_sucursal_vs_form.py
T
2026-05-30 14:31:19 -06:00

560 lines
23 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""audit_brand_sucursal_vs_form.py
Read-only. Compara el campo 'Sucursal' que el contacto envio originalmente en
el formulario (cuenta de Marca) contra el valor actual del campo 'Sucursal' en
el contacto de Marca. Usa similitud difusa para tolerar abreviaciones de
estado de versiones viejas del form (ej. 'QRO' vs 'Queretaro, Queretaro').
Requisitos:
1. Haber corrido el sync global (Sincronizar Todo) para tener contactos.
2. Haber corrido `python scripts/sync_forms_brand.py` para tener
form_submissions en SQLite.
Buckets (por defecto):
- OK (similitud >= 0.60): el form y Marca apuntan a la misma sucursal o
a una abreviacion razonable.
- VERIFICAR (0.30 <= similitud < 0.60): nombres parecidos pero no
identicos; conviene revisar manualmente.
- DISCREPANCIA (similitud < 0.30): el contacto deberia estar en otra
sucursal segun lo que el mismo cliente puso en el formulario.
Casos especiales:
- Contacto sin campo Sucursal en Marca: bucket CONTACTO_SIN_SUCURSAL.
- Submission con sucursal_value vacio: se ignora (no aporta evidencia).
- Multiples submissions por contacto: usamos el ULTIMO (createdAt mas
reciente) porque refleja la intencion mas actual del cliente.
Uso:
python scripts/audit_brand_sucursal_vs_form.py
python scripts/audit_brand_sucursal_vs_form.py --filter-marca queretaro
python scripts/audit_brand_sucursal_vs_form.py --show all
python scripts/audit_brand_sucursal_vs_form.py --xlsx
"""
import argparse
import json
import os
import re
import sys
import unicodedata
from collections import Counter, defaultdict
from datetime import datetime
from difflib import SequenceMatcher
# Make the directory two levels above this file importable so the
# project-local modules below resolve when the script is executed directly.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)

import db  # noqa: E402
from common import REPORT_DRIFT  # noqa: E402

# Location audited by default (the "Marca" account, see --location).
BRAND_LOCATION_ID = "GbKkBpCmKu2QmloKFHy3"
# Local alias kept for compatibility with the rest of the script.
EXPORTS_DIR = REPORT_DRIFT
# --- Normalization helpers ----------------------------------------------------
# Common abbreviations of Mexican states that show up in submissions from older
# form versions, mapped to the full state name (lowercase, no accents).
# normalize_sucursal() replaces any token equal to a key with the words of its
# value.
# NOTE(review): several keys are also ordinary Spanish words or address
# abbreviations (e.g. "col", "sin", "son", "ver"); confirm that expanding them
# unconditionally is acceptable for the sucursal values seen in practice.
STATE_ABBR = {
"ags": "aguascalientes",
"bc": "baja california",
"bcs": "baja california sur",
"camp": "campeche",
"chis": "chiapas",
"chih": "chihuahua",
"coah": "coahuila",
"col": "colima",
"cdmx": "ciudad de mexico",
"df": "ciudad de mexico",
"dgo": "durango",
"edomex": "estado de mexico",
"mex": "estado de mexico",
"gto": "guanajuato",
"gro": "guerrero",
"hgo": "hidalgo",
"jal": "jalisco",
"mich": "michoacan",
"mor": "morelos",
"nay": "nayarit",
"nl": "nuevo leon",
"oax": "oaxaca",
"pue": "puebla",
"qro": "queretaro",
"qroo": "quintana roo",
"slp": "san luis potosi",
"sin": "sinaloa",
"son": "sonora",
"tab": "tabasco",
"tamps": "tamaulipas",
"tlax": "tlaxcala",
"ver": "veracruz",
"yuc": "yucatan",
"zac": "zacatecas",
}
def strip_accents(text):
    """Return ``text`` as a string with its diacritical marks removed.

    Falsy input (``None``, ``""``, ``0`` ...) yields an empty string. Any other
    value is converted with ``str()``, decomposed with NFKD and stripped of
    every combining character, so ``"Querétaro"`` becomes ``"Queretaro"``.
    """
    if not text:
        return ""
    decomposed = unicodedata.normalize("NFKD", str(text))
    # combining() is 0 for base characters and non-zero for accent marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def normalize_sucursal(value):
    """Normalize a branch ("sucursal") name so two spellings can be compared.

    Processing steps:
      - convert to ``str``, strip accents and lowercase;
      - replace punctuation with spaces and split on whitespace;
      - expand known state abbreviations through ``STATE_ABBR``
        (e.g. ``qro`` -> ``queretaro``);
      - drop stopwords and brand words that carry no location information;
      - drop repeated tokens, keeping the first occurrence of each.

    Args:
        value: Raw field value; ``None`` is accepted.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` when ``value``
        is ``None`` or nothing is left after cleaning.
    """
    if value is None:
        return ""
    text = strip_accents(str(value)).lower()
    text = re.sub(r"[^\w\s]", " ", text)

    expanded = []
    for tok in text.split():
        if tok in STATE_ABBR:
            # An abbreviation may expand to several words ("slp" -> 3 tokens).
            expanded.extend(STATE_ABBR[tok].split())
        else:
            expanded.append(tok)

    stop = {"de", "la", "el", "los", "las", "mp", "monte", "providencia"}
    # dict.fromkeys de-duplicates while preserving first-seen order, so
    # "Queretaro, QRO" and "Queretaro" normalize to the same string.
    unique = dict.fromkeys(t for t in expanded if t not in stop)
    return " ".join(unique)
def similarity(a, b):
    """Score how alike two sucursal names are, as a float between 0 and 1.

    Token overlap (Jaccard) takes priority over character-level matching,
    because two unrelated branches (Puebla vs Queretaro) can share letters
    without sharing a single token.

    Rules:
      - either side empty after normalization      -> 0.0
      - identical normalized strings               -> 1.0
      - no token in common                         -> ``seq * 0.3`` capped at 0.25
      - otherwise                                  -> 50/50 average of the
        Jaccard index and the ``SequenceMatcher`` ratio
    """
    norm_a = normalize_sucursal(a)
    norm_b = normalize_sucursal(b)
    if not (norm_a and norm_b):
        return 0.0
    if norm_a == norm_b:
        return 1.0

    tokens_a = set(norm_a.split())
    tokens_b = set(norm_b.split())
    # Defensive guard: a string made only of whitespace would yield no tokens.
    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    char_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
    if not shared:
        # Disjoint token sets mean clearly different branches. The cap keeps
        # the score under the default verify threshold, while the scaled
        # character ratio still separates near-typos from gross mismatches.
        return min(0.25, char_ratio * 0.3)

    token_ratio = len(shared) / len(tokens_a | tokens_b)
    return 0.5 * token_ratio + 0.5 * char_ratio
# --- Contact origin classification --------------------------------------------
# Rules applied by classify_origin(), in this order (first match wins):
# 1. Tag 'formulario' and a recorded form submission -> FORMULARIO (hard evidence)
# 2. Tag 'formulario' without a recorded submission -> FORMULARIO_SIN_RASTRO
# 3. Tag 'sucursal' (no 'formulario' tag) -> SUCURSAL
# 4. Source contains 'web user' / 'webuser' or 'sucursal' -> SUCURSAL
# 5. Source contains 'form' -> FORMULARIO if a submission exists,
#    otherwise FORMULARIO_SIN_RASTRO
# 6. Source contains any DIGITAL_KEYWORDS entry -> DIGITAL
# 7. Source contains 'integration' -> INTEGRATION (probably a later reassignment)
# 8. No clear signal -> DESCONOCIDO
# Lowercase substrings that mark a contact source as a digital channel.
# Matching is by substring, so short entries such as "ads" also match inside
# longer words.
DIGITAL_KEYWORDS = ("facebook", "instagram", "google", "tiktok", "youtube",
"ads", "lead", "landing", "messenger", "linkedin")
def classify_origin(contact_tags, contact_source, has_form_submission):
    """Guess where a contact came from using its tags, source and submissions.

    Tags and source are compared case-insensitively after trimming. Tags take
    precedence over the source; the first matching rule wins.

    Args:
        contact_tags: Iterable of tags (or ``None``); falsy entries are ignored.
        contact_source: Native source string of the contact (or ``None``).
        has_form_submission: Whether a form submission exists for the contact.

    Returns:
        One of ``FORMULARIO``, ``FORMULARIO_SIN_RASTRO``, ``SUCURSAL``,
        ``DIGITAL``, ``INTEGRATION`` or ``DESCONOCIDO``.
    """
    tags = {str(tag).strip().lower() for tag in (contact_tags or []) if tag}
    source = str(contact_source or "").strip().lower()

    # Tag-based evidence first.
    if "formulario" in tags:
        return "FORMULARIO" if has_form_submission else "FORMULARIO_SIN_RASTRO"
    if "sucursal" in tags:
        return "SUCURSAL"

    # Then fall back to substrings of the native source.
    if any(marker in source for marker in ("web user", "webuser", "sucursal")):
        return "SUCURSAL"
    if "form" in source:
        # e.g. source "Formulario" / "Formulario - Sitio Web" without the tag.
        return "FORMULARIO" if has_form_submission else "FORMULARIO_SIN_RASTRO"
    if any(keyword in source for keyword in DIGITAL_KEYWORDS):
        return "DIGITAL"
    if "integration" in source:
        return "INTEGRATION"
    return "DESCONOCIDO"
def origin_confidence(origin):
    """Return how much weight an origin carries when judging the Sucursal.

    The result is ``"alta"``, ``"media"`` or ``"baja"``; origins that are not
    listed fall back to ``"baja"``.
    """
    weights = {
        "FORMULARIO": "alta",  # the customer picked the branch themselves
        "FORMULARIO_SIN_RASTRO": "media",
        "SUCURSAL": "alta",  # created manually at that branch
        "DIGITAL": "media",
        "INTEGRATION": "baja",  # probably a historical reassignment
        "DESCONOCIDO": "baja",
    }
    try:
        return weights[origin]
    except KeyError:
        return "baja"
def parse_tags(tags_value):
    """Decode a contact's tags, which SQLite stores as a JSON string.

    Returns the list of tags. A list is passed through unchanged; empty input,
    undecodable input and JSON that is not a list all yield ``[]``.
    """
    if not tags_value:
        return []
    if isinstance(tags_value, list):
        return tags_value
    try:
        decoded = json.loads(tags_value)
    except Exception:
        # Best effort: malformed tags are treated as "no tags".
        return []
    if isinstance(decoded, list):
        return decoded
    return []
# --- Extracting the contact's Sucursal value in Marca ------------------------
# Field name to look for, already lowercase and accent-free; schema field
# names are normalized the same way before being compared against it.
SUCURSAL_CANONICAL = "sucursal"
def extract_custom_value(field):
    """Return the first meaningful value stored in a custom-field dict.

    The known value keys are probed in a fixed order. A candidate is skipped
    when it is ``None``, a blank/whitespace-only string, or an empty list or
    dict. Returns ``None`` when no key holds a usable value.
    """
    candidate_keys = (
        "value", "fieldValueString", "fieldValueDate",
        "fieldValueNumber", "fieldValueArray",
        "fieldValueOptions", "fieldValueFile",
    )
    for key in candidate_keys:
        candidate = field.get(key)
        blank_text = isinstance(candidate, str) and not candidate.strip()
        empty_container = isinstance(candidate, (list, dict)) and not candidate
        if candidate is not None and not blank_text and not empty_container:
            return candidate
    return None
def resolve_sucursal_field_id_from_db(location_id):
    """Look up the id of the contact field named 'Sucursal' for a location.

    Reads the ``object_schemas`` table (populated by the sync) instead of
    calling GHL again. Field names are compared accent-free, lowercase and
    trimmed. Returns the ``field_id`` of the first match, or ``None``.
    """
    query = (
        "SELECT field_id, field_name FROM object_schemas "
        "WHERE location_id=? AND object_key='contact'"
    )
    connection = db.get_db_connection()
    try:
        schema_rows = connection.execute(query, (location_id,)).fetchall()
        return next(
            (
                row["field_id"]
                for row in schema_rows
                if strip_accents(row["field_name"]).lower().strip() == SUCURSAL_CANONICAL
            ),
            None,
        )
    finally:
        connection.close()
def get_contact_sucursal(contact_row, sucursal_field_id):
    """Return the contact's Sucursal custom-field value as text, or ``None``.

    ``contact_row["custom_fields_json"]`` is expected to hold a JSON list of
    field dicts identified by ``id`` or ``fieldId``. List values are joined
    with ``", "``; any other value is stringified and trimmed. ``None`` is
    returned when the field id is missing, the JSON cannot be read, no
    matching entry has a usable value, or the resulting text is empty.
    """
    if not sucursal_field_id:
        return None
    try:
        raw_fields = contact_row.get("custom_fields_json") or "[]"
        custom_fields = json.loads(raw_fields)
    except Exception:
        # Best effort: an unreadable row counts as "no Sucursal".
        return None
    if not isinstance(custom_fields, list):
        return None

    matching = (
        entry for entry in custom_fields
        if isinstance(entry, dict)
        and (entry.get("id") or entry.get("fieldId")) == sucursal_field_id
    )
    for entry in matching:
        raw = extract_custom_value(entry)
        if raw is None:
            # Keep scanning: a later entry with the same id may hold a value.
            continue
        if isinstance(raw, list):
            joined = ", ".join(str(item) for item in raw if item is not None)
            return joined or None
        return str(raw).strip() or None
    return None
# --- Buckets y reporte -------------------------------------------------------
def categorize(sim, ok_threshold, verify_threshold):
    """Map a similarity score to its report bucket.

    Returns ``"OK"`` when ``sim >= ok_threshold``, otherwise ``"VERIFICAR"``
    when ``sim >= verify_threshold``, otherwise ``"DISCREPANCIA"``.
    """
    # Checked from the strictest floor down; the first one reached wins.
    ladder = (("OK", ok_threshold), ("VERIFICAR", verify_threshold))
    for label, floor in ladder:
        if sim >= floor:
            return label
    return "DISCREPANCIA"
def latest_submission_per_contact(submissions):
    """Keep, for each contact, the most recent submission that has a sucursal.

    Submissions without ``contact_id`` or with an empty ``sucursal_value`` are
    ignored. "Most recent" is decided by comparing the ``created_at`` values
    (a missing value counts as ``""``); on a tie the earlier-seen submission
    is kept.

    Returns:
        dict mapping ``contact_id`` to the chosen submission dict.
    """
    newest = {}
    for submission in submissions:
        contact_id = submission.get("contact_id")
        if not (contact_id and submission.get("sucursal_value")):
            continue
        current = newest.get(contact_id)
        if current is not None:
            candidate_ts = submission.get("created_at") or ""
            current_ts = current.get("created_at") or ""
            if candidate_ts <= current_ts:
                continue
        newest[contact_id] = submission
    return newest
# Canonical bucket order shared by `--show all` and the Excel export.
_BUCKET_ORDER = (
    "DISCREPANCIA", "VERIFICAR", "OK", "CONTACTO_SIN_SUCURSAL",
    "SIN_SUBMISSION", "CONTACTO_NO_EN_DB",
)

# Column order of every worksheet in the Excel export.
_XLSX_HEADER = (
    "bucket", "similarity", "origin", "origin_confidence",
    "contact_id", "name", "email", "phone",
    "tags", "source", "marca_sucursal", "form_sucursal",
    "submission_at", "submission_id",
)


def _marca_passes_filter(marca_value, filter_norm):
    """Return True when no --filter-marca is active or ``marca_value`` contains it."""
    if not filter_norm:
        return True
    if not marca_value:
        return False
    return filter_norm in strip_accents(marca_value).lower()


def _print_bucket_details(buckets, sections, origin_filter):
    """Print one detail listing per requested bucket, optionally limited by origin."""
    for sec in sections:
        rows = buckets[sec]
        if origin_filter:
            rows = [r for r in rows if (r.get("origin") or "").upper() in origin_filter]
        if not rows:
            continue
        print("\n" + "-" * 72)
        print(f"{sec} ({len(rows)})")
        print("-" * 72)
        for r in rows:
            name = (r.get("name") or "").strip() or "(sin nombre)"
            sim_str = f"sim={r['similarity']:.2f}" if r.get("similarity") is not None else "sim=N/A"
            origin = r.get("origin", "?")
            conf = r.get("origin_confidence", "?")
            print(f" {name} [{sim_str}] origen={origin} ({conf})")
            print(f" contact_id: {r['contact_id']} phone={r.get('phone')!r} email={r.get('email')!r}")
            print(f" tags={r.get('tags')!r} source={r.get('source')!r}")
            print(f" Marca dice: {r.get('marca_sucursal')!r}")
            if r.get("form_sucursal"):
                print(f" Form dijo: {r['form_sucursal']!r} (submission_at={r.get('submission_at')})")
            else:
                print(" Form: (sin submission registrado)")


def _resolve_xlsx_path(xlsx_arg):
    """Turn the --xlsx argument into the path of the workbook to write.

    An empty argument yields a timestamped file in ``EXPORTS_DIR``. A bare
    file name is placed in ``EXPORTS_DIR`` and gets ``.xlsx`` appended when it
    has no extension. Absolute paths and paths with a directory part are used
    unchanged.
    """
    if not xlsx_arg:
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        return os.path.join(EXPORTS_DIR, f"sucursal_vs_form_{ts}.xlsx")
    # os.altsep covers "/" on Windows, where os.sep alone would miss it.
    separators = [sep for sep in (os.sep, os.altsep) if sep]
    has_directory = any(sep in xlsx_arg for sep in separators)
    if os.path.isabs(xlsx_arg) or has_directory:
        return xlsx_arg
    base, ext = os.path.splitext(xlsx_arg)
    return os.path.join(EXPORTS_DIR, base + (ext or ".xlsx"))


def _export_xlsx(buckets, xlsx_arg):
    """Write one worksheet per bucket; warn and skip when openpyxl is missing."""
    try:
        from openpyxl import Workbook
    except ImportError:
        print("\nWARN: openpyxl no esta instalado; salteo --xlsx")
        return
    os.makedirs(EXPORTS_DIR, exist_ok=True)
    path = _resolve_xlsx_path(xlsx_arg)
    wb = Workbook()
    wb.remove(wb.active)  # drop the default empty sheet
    for sec in _BUCKET_ORDER:
        ws = wb.create_sheet(sec[:31])  # title truncated to 31 characters
        ws.append(list(_XLSX_HEADER))
        for r in buckets[sec]:
            tags = r.get("tags")
            if isinstance(tags, list):
                # Tags come from decoded JSON and may contain non-strings.
                tags = ",".join(str(t) for t in tags)
            ws.append([tags if col == "tags" else r.get(col) for col in _XLSX_HEADER])
    wb.save(path)
    print(f"\nExcel exportado: {path}")


def main():
    """Run the audit from the command line.

    Loads form submissions and contacts from the local database, compares the
    sucursal submitted in the form against the one stored on the contact,
    assigns every contact to a bucket, prints a summary plus the requested
    detail, and optionally exports everything to Excel.

    Raises:
        SystemExit: If the thresholds are inconsistent or the 'Sucursal'
            field cannot be found in ``object_schemas`` for the location.
    """
    # __doc__ is None under ``python -OO``; fall back to no description.
    doc_lines = (__doc__ or "").splitlines()
    parser = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    parser.add_argument("--location", default=BRAND_LOCATION_ID,
                        help=f"Location a auditar. Default: Marca ({BRAND_LOCATION_ID})")
    parser.add_argument("--ok-threshold", type=float, default=0.60,
                        help="Similitud minima para considerar OK. Default 0.60")
    parser.add_argument("--verify-threshold", type=float, default=0.30,
                        help="Similitud minima para 'verificar'. <esto = DISCREPANCIA. Default 0.30")
    parser.add_argument("--show", choices=["discrepancia", "verificar", "all", "none"],
                        default="discrepancia",
                        help="Que detalle imprimir. Default: solo DISCREPANCIA.")
    parser.add_argument("--filter-marca",
                        help="Filtra solo contactos cuya Sucursal en Marca contenga este texto "
                             "(case-insensitive, sin acentos). Ej: 'queretaro'")
    parser.add_argument("--origin",
                        help="Filtra el detalle a uno o varios origenes (coma-separados): "
                             "FORMULARIO, FORMULARIO_SIN_RASTRO, SUCURSAL, DIGITAL, INTEGRATION, DESCONOCIDO")
    parser.add_argument("--xlsx", dest="xlsx_path", nargs="?", const="",
                        help="Exporta Excel. Sin argumento guarda en exports/ con timestamp.")
    args = parser.parse_args()

    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
    if not (0 <= args.verify_threshold < args.ok_threshold <= 1):
        raise SystemExit("Thresholds invalidos: 0 <= verify < ok <= 1")
    location_id = args.location

    # 1. Submissions that carry a sucursal. An empty result is not fatal: the
    #    audit then classifies origin from tags + source only.
    submissions = db.get_form_submissions(location_id=location_id, with_sucursal_only=True)
    latest_by_contact = latest_submission_per_contact(submissions)
    print(f"Submissions con sucursal: {len(submissions)} "
          f"(unicos por contacto, ultimo: {len(latest_by_contact)})")
    # Contact ids with any submission at all (used to classify origin).
    all_subs_any = db.get_form_submissions(location_id=location_id)
    contact_ids_with_submission = {s["contact_id"] for s in all_subs_any if s.get("contact_id")}

    # 2. Contacts of the location. Close the connection even if the query fails.
    conn = db.get_db_connection()
    try:
        contact_rows = conn.execute(
            "SELECT * FROM contacts WHERE location_id=?", (location_id,)
        ).fetchall()
    finally:
        conn.close()
    contacts_by_id = {r["id"]: dict(r) for r in contact_rows}
    print(f"Contactos en {location_id}: {len(contacts_by_id)}")

    # 3. Field id of 'Sucursal'.
    sucursal_fid = resolve_sucursal_field_id_from_db(location_id)
    if not sucursal_fid:
        raise SystemExit(f"No encontre el field 'Sucursal' en object_schemas para {location_id}. "
                         "Corre el sync global primero.")

    # 4. Compare.
    filter_norm = strip_accents(args.filter_marca).lower() if args.filter_marca else None
    buckets = {name: [] for name in ("OK", "VERIFICAR", "DISCREPANCIA",
                                     "CONTACTO_SIN_SUCURSAL", "CONTACTO_NO_EN_DB",
                                     "SIN_SUBMISSION")}
    by_marca_sucursal = defaultdict(Counter)  # normalized marca sucursal -> bucket -> count
    origin_counter = Counter()
    origin_x_bucket = defaultdict(Counter)  # origin -> bucket -> count

    def build_row(contact, cid, sub):
        """Assemble the report row for a contact and its (optional) submission."""
        tags = parse_tags(contact.get("tags"))
        source = contact.get("source")
        origin = classify_origin(tags, source, cid in contact_ids_with_submission)
        return {
            "contact_id": cid,
            "name": (contact.get("first_name") or "") + " " + (contact.get("last_name") or ""),
            "email": contact.get("email"),
            "phone": contact.get("phone"),
            "tags": tags,
            "source": source,
            "origin": origin,
            "origin_confidence": origin_confidence(origin),
            "marca_sucursal": get_contact_sucursal(contact, sucursal_fid),
            "form_sucursal": (sub.get("sucursal_value") if sub else None) or None,
            "similarity": None,
            "bucket": None,
            "submission_at": (sub.get("created_at") if sub else None),
            "submission_id": (sub.get("id") if sub else None),
        }

    def file_row(row, bucket):
        """Store ``row`` in ``bucket`` and update the per-origin counters."""
        row["bucket"] = bucket
        buckets[bucket].append(row)
        origin_x_bucket[row["origin"]][bucket] += 1
        origin_counter[row["origin"]] += 1

    # 4a. Contacts WITH a sucursal-bearing submission: compare form vs marca.
    contacts_evaluated_via_form = set()
    for cid, sub in latest_by_contact.items():
        contact = contacts_by_id.get(cid)
        form_val = sub.get("sucursal_value") or ""
        if not contact:
            buckets["CONTACTO_NO_EN_DB"].append({
                "bucket": "CONTACTO_NO_EN_DB",
                "contact_id": cid,
                "name": sub.get("name"),
                "email": sub.get("email"),
                "phone": sub.get("phone"),
                "form_sucursal": form_val,
                "submission_at": sub.get("created_at"),
                "origin": "FORMULARIO",
                "origin_confidence": "alta",
            })
            continue
        row_data = build_row(contact, cid, sub)
        marca_val = row_data["marca_sucursal"]
        if not _marca_passes_filter(marca_val, filter_norm):
            continue
        contacts_evaluated_via_form.add(cid)
        if not marca_val:
            file_row(row_data, "CONTACTO_SIN_SUCURSAL")
            continue
        sim = similarity(form_val, marca_val)
        bucket = categorize(sim, args.ok_threshold, args.verify_threshold)
        row_data["similarity"] = round(sim, 3)
        file_row(row_data, bucket)
        by_marca_sucursal[strip_accents(marca_val).lower()][bucket] += 1

    # 4b. Every remaining contact: there is no form value to compare against,
    #     so it is only classified by origin. Useful to see the whole universe.
    for cid, contact in contacts_by_id.items():
        if cid in contacts_evaluated_via_form:
            continue
        row_data = build_row(contact, cid, None)
        if not _marca_passes_filter(row_data["marca_sucursal"], filter_norm):
            continue
        file_row(row_data, "SIN_SUBMISSION")

    # 5. Summary.
    total_with_form = sum(len(buckets[k]) for k in ("OK", "VERIFICAR", "DISCREPANCIA",
                                                    "CONTACTO_SIN_SUCURSAL"))
    total_overall = total_with_form + len(buckets["SIN_SUBMISSION"]) + len(buckets["CONTACTO_NO_EN_DB"])
    print("\n" + "=" * 72)
    print("RESUMEN (con submission del formulario = evidencia dura)")
    print("=" * 72)
    print(f"Contactos con submission y evaluados: {total_with_form}")
    print(f" OK (>= {args.ok_threshold:.2f}): {len(buckets['OK'])}")
    print(f" VERIFICAR ({args.verify_threshold:.2f}-{args.ok_threshold:.2f}): {len(buckets['VERIFICAR'])}")
    print(f" DISCREPANCIA (< {args.verify_threshold:.2f}): {len(buckets['DISCREPANCIA'])}")
    print(f" Contacto sin Sucursal en Marca: {len(buckets['CONTACTO_SIN_SUCURSAL'])}")
    print(f"\nContactos SIN submission (clasificados solo por origen): {len(buckets['SIN_SUBMISSION'])}")
    print(f"Submission sin contacto en DB: {len(buckets['CONTACTO_NO_EN_DB'])}")
    print(f"\nTotal universo: {total_overall}")

    # Breakdown by probable origin.
    if origin_counter:
        print("\n" + "-" * 72)
        print("ORIGEN PROBABLE (tags + source nativo + submission)")
        print("-" * 72)
        for origin, n in origin_counter.most_common():
            conf = origin_confidence(origin)
            parts = [f"{b}={c}" for b, c in origin_x_bucket[origin].most_common()]
            print(f" {origin:24s} ({conf:5s} confianza): {n:4d} [{', '.join(parts)}]")

    if by_marca_sucursal:
        print("\n" + "-" * 72)
        print("DESGLOSE POR SUCURSAL EN MARCA (solo contactos con submission)")
        print("-" * 72)
        for marca_suc, cnt in sorted(by_marca_sucursal.items(), key=lambda x: -sum(x[1].values())):
            total_suc = sum(cnt.values())
            print(f" {marca_suc!r}: {total_suc} "
                  f"(OK={cnt.get('OK',0)}, VERIF={cnt.get('VERIFICAR',0)}, DISCREP={cnt.get('DISCREPANCIA',0)})")

    # 6. Detail. "verificar" also lists DISCREPANCIA; "none" prints nothing.
    sections_by_mode = {
        "all": list(_BUCKET_ORDER),
        "discrepancia": ["DISCREPANCIA"],
        "verificar": ["DISCREPANCIA", "VERIFICAR"],
    }
    sections = sections_by_mode.get(args.show, [])
    origin_filter = {o.strip().upper() for o in (args.origin or "").split(",") if o.strip()}
    _print_bucket_details(buckets, sections, origin_filter)

    # 7. Optional Excel export (--xlsx given, with or without a path).
    if args.xlsx_path is not None:
        _export_xlsx(buckets, args.xlsx_path)
# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()