Primer commit

This commit is contained in:
2026-05-30 14:31:19 -06:00
commit a35d26fac0
277 changed files with 265240 additions and 0 deletions
+685
View File
@@ -0,0 +1,685 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""find_cross_branch_duplicates.py
Detecta contactos que aparecen en 2 o mas sucursales distintas (anomalia
cross-branch). El criterio primario es coincidencia por telefono normalizado;
el secundario es coincidencia por email normalizado. Despues de armar los
grupos sospechosos, hace doble check contra la cuenta de Marca principal para
indicar cuantas copias del mismo contacto existen alli.
Lectura 100% read-only desde mp_manager.sqlite. Requiere una sincronizacion
previa desde el dashboard.
Uso:
python scripts/find_cross_branch_duplicates.py
python scripts/find_cross_branch_duplicates.py --xlsx duplicados.xlsx
python scripts/find_cross_branch_duplicates.py --json duplicados.json --top 50
python scripts/find_cross_branch_duplicates.py --match phone
"""
import argparse
import json
import os
import sqlite3
import sys
from collections import defaultdict
from datetime import datetime
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils import get_column_letter
# Directory two levels above this file. It is placed at the front of sys.path
# (once) before the `common` import that follows; presumably `common` lives
# there and would not resolve when the script is run directly -- TODO confirm.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
from common import ( # noqa: E402
DB_PATH,
REPORT_DUPLICADOS,
match_contacts,
normalize_email,
normalize_phone,
)
# location_id of the main brand ("Marca") account. main() uses it to split the
# loaded contacts into brand contacts and branch contacts.
BRAND_LOCATION_ID = "GbKkBpCmKu2QmloKFHy3"
# Default `threshold` that build_groups() forwards to match_contacts().
MATCH_THRESHOLD = 0.80
def resolve_export_path(user_path, default_basename, extension):
    """Decide where an export file should be written.

    A path that is absolute or contains a directory separator is honoured
    as given. A bare file name (or no name at all) is placed inside
    REPORT_DUPLICADOS with a timestamp appended to its stem.

    Args:
        user_path: Path or bare file name supplied by the user; may be
            empty or None.
        default_basename: Stem to use when ``user_path`` is empty.
        extension: Extension (with leading dot) to use when ``user_path``
            has none or is empty.

    Returns:
        A ``(path, in_report_dir)`` tuple. ``in_report_dir`` is True only
        when the file was placed inside REPORT_DUPLICADOS.
    """
    points_to_folder = bool(user_path) and (
        os.path.isabs(user_path) or os.sep in user_path or "/" in user_path
    )
    if points_to_folder:
        return user_path, False

    # Only the managed report directory is created on demand.
    os.makedirs(REPORT_DUPLICADOS, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    stem, suffix = default_basename, extension
    if user_path:
        stem, given_suffix = os.path.splitext(os.path.basename(user_path))
        suffix = given_suffix or extension
    return os.path.join(REPORT_DUPLICADOS, f"{stem}_{stamp}{suffix}"), True
def safe_print(*args, **kwargs):
    """Write the arguments to stdout, degrading unencodable characters.

    Behaves like a minimal ``print``: positional arguments are converted
    with ``str`` and joined by ``sep`` (default one space), followed by
    ``end`` (default newline). Other keyword arguments are ignored. If the
    stream raises UnicodeEncodeError, the text is re-encoded with
    ``errors="replace"`` for the stream's encoding and written again.
    """
    separator = kwargs.get("sep", " ")
    terminator = kwargs.get("end", "\n")
    message = separator.join(map(str, args))
    stream_encoding = sys.stdout.encoding or "utf-8"
    try:
        sys.stdout.write(message + terminator)
        sys.stdout.flush()
    except UnicodeEncodeError:
        # Round-trip through the stream encoding so unsupported characters
        # become replacement markers instead of aborting the report.
        degraded = message.encode(stream_encoding, errors="replace").decode(stream_encoding)
        sys.stdout.write(degraded + terminator)
        sys.stdout.flush()
def display_name(row):
    """Return the contact's full name, or a placeholder when it has none.

    ``row`` is any mapping with ``first_name`` and ``last_name`` keys whose
    values may be None. Each part is stripped before being joined by a
    single space.
    """
    parts = [(row[field] or "").strip() for field in ("first_name", "last_name")]
    combined = " ".join(parts).strip()
    if not combined:
        return "(sin nombre)"
    return combined
class UnionFind:
    """Disjoint-set structure over arbitrary hashable keys.

    Keys are registered lazily the first time they are passed to ``find``
    (and therefore to ``union``).
    """

    def __init__(self):
        # Maps each known key to its parent; a root is its own parent.
        self.parent = {}

    def find(self, key):
        """Return the representative of ``key``'s set, registering it if new."""
        links = self.parent
        if key not in links:
            links[key] = key
        node = key
        while links[node] != node:
            # Re-point the node at its grandparent, then continue from there.
            grandparent = links[links[node]]
            links[node] = grandparent
            node = grandparent
        return node

    def union(self, a, b):
        """Merge the sets of ``a`` and ``b``; ``a``'s root becomes the root."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a == root_b:
            return
        self.parent[root_b] = root_a
def load_contacts():
    """Load accounts and contacts from the local SQLite database.

    Returns:
        A ``(accounts, contacts)`` tuple: ``accounts`` maps each
        ``location_id`` to a dict with ``location_id``, ``nombre`` and
        ``type``; ``contacts`` is a list of dicts with ``id``,
        ``location_id``, ``first_name``, ``last_name``, ``email`` and
        ``phone``.

    Raises:
        SystemExit: If the database file at DB_PATH does not exist.
    """
    if not os.path.exists(DB_PATH):
        raise SystemExit(
            f"No se encontro la base local en {DB_PATH}. "
            "Ejecuta una sincronizacion desde el dashboard antes de correr este script."
        )

    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    try:
        account_rows = connection.execute(
            "SELECT location_id, nombre, type FROM accounts"
        ).fetchall()
        accounts = {}
        for account in account_rows:
            accounts[account["location_id"]] = dict(account)

        contact_rows = connection.execute(
            "SELECT id, location_id, first_name, last_name, email, phone "
            "FROM contacts"
        ).fetchall()
        contacts = list(map(dict, contact_rows))
        return accounts, contacts
    finally:
        # Always release the connection, including on query errors.
        connection.close()
def index_brand(brand_contacts):
    """Index brand contacts by normalized phone and by normalized email.

    Returns:
        A ``(by_phone, by_email)`` pair of ``defaultdict(list)`` objects
        mapping each non-empty normalized key to the contacts carrying it.
        Contacts whose key normalizes to a falsy value are left out of
        that index.
    """
    by_phone = defaultdict(list)
    by_email = defaultdict(list)
    indexers = (
        ("phone", normalize_phone, by_phone),
        ("email", normalize_email, by_email),
    )
    for entry in brand_contacts:
        for field, normalize, bucket in indexers:
            key = normalize(entry[field])
            if key:
                bucket[key].append(entry)
    return by_phone, by_email
def build_groups(branch_contacts, match_modes, threshold=MATCH_THRESHOLD):
    """Group branch contacts that look like the same person across branches.

    Phone matching: contacts sharing a normalized phone and living in
    different locations are compared pairwise with ``match_contacts``; the
    pair is linked only when the returned ``level`` is not ``"none"``. When
    the result instead lists ``"phone_collision_unresolved"`` among its
    ``reasons`` (shared phone, diverging name -- e.g. a couple sharing one
    number), the pair is reported for manual review and is never linked
    directly through email either.

    Email matching: contacts sharing a normalized email are linked, except
    for pairs already flagged as phone collisions.

    Only linked components spanning at least two locations are returned.

    Args:
        branch_contacts: List of contact dicts (``id``, ``location_id``,
            ``phone``, ``email`` plus whatever ``match_contacts`` reads).
        match_modes: Collection containing ``"phone"`` and/or ``"email"``.
        threshold: Forwarded to ``match_contacts``.

    Returns:
        ``(groups, unmatched_phone_collisions)``. Each group is a dict with
        ``members``, ``locations``, ``phones``, ``emails`` and
        ``match_reasons``. ``match_reasons`` lists the criteria
        (``"telefono"``, ``"email"``) that actually linked members of the
        group -- not merely whether members happen to have a phone or an
        email on file. Groups are sorted by number of locations, then by
        number of members, both descending. Each collision is a dict with
        ``phone``, ``a``, ``b`` and ``name_score``.
    """
    from itertools import combinations

    uf = UnionFind()
    phone_index = defaultdict(list)
    email_index = defaultdict(list)
    for idx, contact in enumerate(branch_contacts):
        if "phone" in match_modes:
            phone = normalize_phone(contact["phone"])
            if phone:
                phone_index[phone].append(idx)
        if "email" in match_modes:
            email = normalize_email(contact["email"])
            if email:
                email_index[email].append(idx)

    unmatched_phone_collisions = []
    seen_collisions = set()
    # Index pairs that share a phone but diverge in name. They must not be
    # linked later via email (this usually signals data mixed up by the
    # integration rather than the same person).
    collision_pairs = set()
    # (contact index, criterion) for every accepted link; resolved to
    # per-group reasons once all unions are done.
    link_reasons = set()

    # Phone: validate every cross-location pair before linking it. Index
    # buckets are filled in ascending order, so each (idx_a, idx_b) pair
    # produced below is already ordered.
    for phone, indices in phone_index.items():
        for idx_a, idx_b in combinations(indices, 2):
            a, b = branch_contacts[idx_a], branch_contacts[idx_b]
            # Pairs inside one location are local duplicates, not
            # cross-branch ones.
            if a["location_id"] == b["location_id"]:
                continue
            result = match_contacts(a, b, threshold=threshold)
            if result["level"] != "none":
                uf.union(idx_a, idx_b)
                link_reasons.add((idx_a, "telefono"))
            elif "phone_collision_unresolved" in result["reasons"]:
                collision_pairs.add((idx_a, idx_b))
                pair_key = tuple(sorted((a["id"], b["id"])))
                if pair_key in seen_collisions:
                    continue
                seen_collisions.add(pair_key)
                unmatched_phone_collisions.append({
                    "phone": phone,
                    "a": a,
                    "b": b,
                    "name_score": result["name_score"],
                })

    # Email: an exact shared email is treated as a strong signal and links
    # the pair, unless the pair was flagged as a phone collision above.
    for indices in email_index.values():
        for pair in combinations(indices, 2):
            if pair in collision_pairs:
                continue
            uf.union(*pair)
            link_reasons.add((pair[0], "email"))

    reasons_by_root = defaultdict(set)
    for idx, reason in link_reasons:
        reasons_by_root[uf.find(idx)].add(reason)

    components = defaultdict(list)
    for idx in sorted(uf.parent):
        components[uf.find(idx)].append(idx)

    groups = []
    for root, member_indices in components.items():
        members = [branch_contacts[i] for i in member_indices]
        locations = {m["location_id"] for m in members}
        # Also discards single-member components.
        if len(locations) < 2:
            continue
        phones = sorted({p for p in (normalize_phone(m["phone"]) for m in members) if p})
        emails = sorted({e for e in (normalize_email(m["email"]) for m in members) if e})
        linked_by = reasons_by_root.get(root, ())
        groups.append({
            "members": members,
            "locations": locations,
            "phones": phones,
            "emails": emails,
            "match_reasons": [r for r in ("telefono", "email") if r in linked_by],
        })

    groups.sort(
        key=lambda g: (len(g["locations"]), len(g["members"])),
        reverse=True,
    )
    return groups, unmatched_phone_collisions
def brand_check(group, brand_by_phone, brand_by_email):
    """Collect the brand contacts that share a phone or email with ``group``.

    Phone keys are looked up first, then email keys. Each brand contact is
    returned once (deduplicated by ``id``), in order of first appearance.
    """
    found = {}
    lookups = (
        (group["phones"], brand_by_phone),
        (group["emails"], brand_by_email),
    )
    for keys, index in lookups:
        for key in keys:
            for candidate in index.get(key, []):
                # setdefault keeps the first contact seen for each id.
                found.setdefault(candidate["id"], candidate)
    return list(found.values())
def print_group(idx, group, brand_matches, accounts):
    """Print the console detail of one duplicate group.

    Args:
        idx: Number shown in brackets for the group.
        group: Group dict as produced by ``build_groups``.
        brand_matches: List of matching brand contacts, or None when the
            brand check was skipped (nothing is printed for it then).
        accounts: Mapping of ``location_id`` to account dicts; used to
            resolve branch names, falling back to the id itself.
    """
    members = group["members"]
    reasons = ", ".join(group["match_reasons"])
    safe_print(
        f"\n[{idx}] {len(group['locations'])} sucursales | {len(members)} contactos "
        f"| coincidencia por: {reasons}"
    )
    if group["phones"]:
        safe_print(f" Telefono(s): {', '.join(group['phones'])}")
    if group["emails"]:
        safe_print(f" Email(s): {', '.join(group['emails'])}")

    for member in members:
        location_id = member["location_id"]
        loc_name = accounts.get(location_id, {}).get("nombre") or location_id
        detail = [
            f" - {loc_name} ({location_id})",
            f" contact_id: {member['id']}",
            f" nombre: {display_name(member)}",
            f" telefono: {member['phone'] or '(vacio)'}",
            f" email: {member['email'] or '(vacio)'}",
        ]
        safe_print("\n".join(detail))

    if brand_matches is None:
        # Brand check disabled: nothing else to report.
        return
    if not brand_matches:
        safe_print(" Doble check en Marca: NO existe contacto coincidente (sospechoso).")
        return

    if len(brand_matches) == 1:
        label = "Marca"
    else:
        label = f"Marca (multiples: {len(brand_matches)})"
    safe_print(f" Doble check en {label}:")
    for contact in brand_matches:
        safe_print(
            f" - contact_id: {contact['id']} | nombre: {display_name(contact)} "
            f"| telefono: {contact['phone'] or '(vacio)'} | email: {contact['email'] or '(vacio)'}"
        )
def _duplicates_sheet_row(group_number, group, contact, accounts, brand_matches, is_brand):
    """Build one 'Duplicados' row for a branch member or a brand contact."""
    location_id = contact["location_id"]
    return [
        group_number,
        len(group["locations"]),
        len(group["members"]),
        "+".join(group["match_reasons"]),
        ";".join(group["phones"]),
        ";".join(group["emails"]),
        location_id,
        accounts.get(location_id, {}).get("nombre") or "",
        contact["id"],
        display_name(contact),
        contact["phone"] or "",
        contact["email"] or "",
        "si" if is_brand else "no",
        # Left blank when the brand check was skipped for this run.
        len(brand_matches) if brand_matches is not None else "",
    ]


def export_xlsx(path, groups, brand_results, accounts, collisions=None):
    """Write the duplicate report to an Excel workbook.

    The "Duplicados" sheet holds one row per branch member of each group,
    followed by one highlighted row per matching brand contact. When
    ``collisions`` is non-empty, a second sheet lists phone collisions whose
    names did not match; those pairs are not treated as duplicates.

    Args:
        path: Destination file path, passed to ``Workbook.save``.
        groups: Groups as returned by ``build_groups``.
        brand_results: One entry per group: the list of brand matches, or
            None when the brand check was skipped.
        accounts: Mapping of ``location_id`` to account dicts.
        collisions: Optional list of collision dicts (``phone``, ``a``,
            ``b``, ``name_score``).
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Duplicados"
    headers = [
        "grupo", "sucursales", "miembros", "match", "phone_keys", "email_keys",
        "location_id", "location_nombre", "contact_id", "nombre", "telefono", "email",
        "es_marca", "marca_matches",
    ]
    ws.append(headers)
    header_font = Font(bold=True, color="FFFFFF")
    header_fill = PatternFill("solid", fgColor="305496")
    brand_fill = PatternFill("solid", fgColor="FFF2CC")
    for col_idx in range(1, len(headers) + 1):
        cell = ws.cell(row=1, column=col_idx)
        cell.font = header_font
        cell.fill = header_fill

    # Track the sheet row ourselves instead of reading ws.max_row for every
    # styled cell: max_row is a computed property (NOTE(review): it appears
    # to scan the populated cells on each access), which made large exports
    # needlessly slow. Row 1 is the header appended above.
    row_number = 1
    for idx, (group, brand_matches) in enumerate(zip(groups, brand_results), start=1):
        entries = [(member, False) for member in group["members"]]
        entries.extend((contact, True) for contact in brand_matches or [])
        for contact, is_brand in entries:
            ws.append(
                _duplicates_sheet_row(idx, group, contact, accounts, brand_matches, is_brand)
            )
            row_number += 1
            if is_brand:
                for col_idx in range(1, len(headers) + 1):
                    ws.cell(row=row_number, column=col_idx).fill = brand_fill

    widths = [7, 12, 10, 14, 30, 30, 24, 28, 24, 28, 18, 32, 9, 14]
    for col_idx, width in enumerate(widths, start=1):
        ws.column_dimensions[get_column_letter(col_idx)].width = width
    ws.freeze_panes = "A2"
    ws.auto_filter.ref = ws.dimensions

    # Second sheet: phone collisions that do not match by name. These pairs
    # share a normalized phone but the name diverges (the "couple with the
    # same number" case). They are NOT treated as duplicates.
    if collisions:
        ws2 = wb.create_sheet("colisiones_phone_sin_match")
        collision_headers = [
            "telefono", "name_score",
            "location_id_a", "location_nombre_a", "contact_id_a",
            "nombre_a", "email_a",
            "location_id_b", "location_nombre_b", "contact_id_b",
            "nombre_b", "email_b",
        ]
        ws2.append(collision_headers)
        for col_idx in range(1, len(collision_headers) + 1):
            cell = ws2.cell(row=1, column=col_idx)
            cell.font = header_font
            cell.fill = header_fill
        for item in collisions:
            a, b = item["a"], item["b"]
            ws2.append([
                item["phone"],
                round(float(item["name_score"]), 3),
                a["location_id"],
                accounts.get(a["location_id"], {}).get("nombre") or "",
                a["id"],
                display_name(a),
                a["email"] or "",
                b["location_id"],
                accounts.get(b["location_id"], {}).get("nombre") or "",
                b["id"],
                display_name(b),
                b["email"] or "",
            ])
        widths2 = [14, 11, 24, 28, 24, 28, 32, 24, 28, 24, 28, 32]
        for col_idx, width in enumerate(widths2, start=1):
            ws2.column_dimensions[get_column_letter(col_idx)].width = width
        ws2.freeze_panes = "A2"
        ws2.auto_filter.ref = ws2.dimensions

    wb.save(path)
def export_json(path, groups, brand_results, accounts, collisions=None):
    """Write the duplicate report to a UTF-8 JSON file.

    The payload has two keys: ``grupos`` (one entry per group, with its
    branch members and brand matches) and ``colisiones_phone_sin_match``
    (phone collisions whose names did not match; empty when ``collisions``
    is None or empty). ``brand_matches`` is null for a group when its entry
    in ``brand_results`` is None, i.e. the brand check was skipped.
    """

    def location_name(location_id):
        return accounts.get(location_id, {}).get("nombre") or ""

    def branch_entry(member):
        return {
            "location_id": member["location_id"],
            "location_nombre": location_name(member["location_id"]),
            "contact_id": member["id"],
            "nombre": display_name(member),
            "telefono": member["phone"] or "",
            "email": member["email"] or "",
        }

    def brand_entry(contact):
        return {
            "contact_id": contact["id"],
            "nombre": display_name(contact),
            "telefono": contact["phone"] or "",
            "email": contact["email"] or "",
        }

    def collision_side(contact):
        return {
            "location_id": contact["location_id"],
            "location_nombre": location_name(contact["location_id"]),
            "contact_id": contact["id"],
            "nombre": display_name(contact),
            "email": contact["email"] or "",
        }

    grupos = []
    for group, brand_matches in zip(groups, brand_results):
        if brand_matches is None:
            brand_section = None
        else:
            brand_section = [brand_entry(contact) for contact in brand_matches]
        grupos.append({
            "sucursales_count": len(group["locations"]),
            "contactos_count": len(group["members"]),
            "match_reasons": group["match_reasons"],
            "phone_keys": group["phones"],
            "email_keys": group["emails"],
            "branch_members": [branch_entry(member) for member in group["members"]],
            "brand_matches": brand_section,
        })

    collision_section = []
    for item in collisions or []:
        collision_section.append({
            "telefono": item["phone"],
            "name_score": round(float(item["name_score"]), 3),
            "a": collision_side(item["a"]),
            "b": collision_side(item["b"]),
        })

    payload = {
        "grupos": grupos,
        "colisiones_phone_sin_match": collision_section,
    }
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
def print_overview(groups, brand_results, accounts, branch_contacts, brand_contacts, match_modes, no_brand_check, phone_collisions=None):
    """Print the final summary of the audit to the console.

    Covers totals, the share of branch contacts involved in duplicates, the
    distribution of groups by match reason / branch count / member count,
    the brand cross-check buckets (unless ``no_brand_check`` is set), the
    ten branches with the most duplicated contacts, and the number of phone
    collisions left for manual review.
    """
    branch_total = len(branch_contacts)
    duplicated_contacts = 0
    contacts_per_branch = defaultdict(int)
    groups_per_branch = defaultdict(set)
    groups_by_branch_count = defaultdict(int)
    groups_by_member_count = defaultdict(int)
    only_phone = 0
    only_email = 0
    both = 0

    for position, group in enumerate(groups):
        members = group["members"]
        duplicated_contacts += len(members)
        groups_by_branch_count[len(group["locations"])] += 1
        groups_by_member_count[len(members)] += 1

        reasons = set(group["match_reasons"])
        if reasons == {"telefono"}:
            only_phone += 1
        elif reasons == {"email"}:
            only_email += 1
        elif {"telefono", "email"} <= reasons:
            both += 1

        for member in members:
            location_id = member["location_id"]
            contacts_per_branch[location_id] += 1
            groups_per_branch[location_id].add(position)

    brand_copies_total = 0
    sin_marca = 0
    una_en_marca = 0
    multi_marca = 0
    if not no_brand_check:
        for matches in brand_results:
            if matches is None:
                continue
            copies = len(matches)
            brand_copies_total += copies
            if copies == 0:
                sin_marca += 1
            elif copies == 1:
                una_en_marca += 1
            else:
                multi_marca += 1

    audited_branches = {c["location_id"] for c in branch_contacts}

    safe_print("\n" + "=" * 78)
    safe_print("OVERVIEW FINAL")
    safe_print("=" * 78)
    safe_print(f"Sucursales auditadas: {len(audited_branches)}")
    # Every branch holding a duplicated contact is a key of contacts_per_branch.
    safe_print(f"Sucursales involucradas en duplicados: {len(contacts_per_branch)}")
    safe_print(f"Contactos totales en sucursales: {branch_total}")
    safe_print(f"Contactos totales en Marca: {len(brand_contacts)}")
    safe_print(f"Criterios de match aplicados: {', '.join(sorted(match_modes))}")
    safe_print("")
    safe_print(f"Grupos de duplicados detectados: {len(groups)}")
    safe_print(f"Contactos duplicados (en sucursales): {duplicated_contacts}")
    if branch_total:
        pct = (duplicated_contacts / branch_total) * 100
        safe_print(f"% de contactos en sucursales duplicados: {pct:.2f}%")
    safe_print("")
    safe_print("Distribucion por motivo de coincidencia:")
    safe_print(f" - solo por telefono: {only_phone}")
    safe_print(f" - solo por email: {only_email}")
    safe_print(f" - por telefono y email (ambos): {both}")

    if groups_by_branch_count:
        safe_print("")
        safe_print("Distribucion por # de sucursales involucradas:")
        for size in sorted(groups_by_branch_count):
            safe_print(f" - {size} sucursales: {groups_by_branch_count[size]} grupos")

    if groups_by_member_count:
        safe_print("")
        safe_print("Distribucion por # de contactos por grupo:")
        for size in sorted(groups_by_member_count):
            safe_print(f" - {size} contactos: {groups_by_member_count[size]} grupos")

    if not no_brand_check:
        safe_print("")
        safe_print("Doble check contra Marca:")
        safe_print(f" - grupos sin contraparte en Marca: {sin_marca}")
        safe_print(f" - grupos con 1 contacto en Marca: {una_en_marca}")
        safe_print(f" - grupos con multiples copias en Marca: {multi_marca}")
        safe_print(f" - contactos sumados en Marca (anomalias): {brand_copies_total}")

    if contacts_per_branch:
        # Most duplicated contacts first; ties broken by number of groups.
        ranking = sorted(
            contacts_per_branch.items(),
            key=lambda kv: (kv[1], len(groups_per_branch[kv[0]])),
            reverse=True,
        )
        safe_print("")
        safe_print("Sucursales con mas contactos en duplicados (top 10):")
        for loc_id, count in ranking[:10]:
            nombre = accounts.get(loc_id, {}).get("nombre") or loc_id
            grupos = len(groups_per_branch[loc_id])
            safe_print(f" - {nombre} ({loc_id}): {count} contactos en {grupos} grupos")

    if phone_collisions:
        safe_print("")
        safe_print("Colisiones de telefono sin match por nombre (revisión manual):")
        safe_print(f" - pares detectados: {len(phone_collisions)}")
        safe_print(" Ver hoja 'colisiones_phone_sin_match' del XLSX para detalle.")
def parse_args():
    """Parse the command-line options of the audit script.

    The parser description is the first line of the module docstring.
    ``--xlsx`` and ``--json`` take an optional value: absent means no
    export (None), present without a value yields an empty string.
    """
    option_table = (
        (
            "--match",
            {
                "default": "phone,email",
                "help": "Criterios de match: 'phone', 'email' o 'phone,email' (default).",
            },
        ),
        (
            "--no-brand-check",
            {
                "action": "store_true",
                "help": "Omite la verificacion cruzada contra la cuenta de Marca.",
            },
        ),
        (
            "--top",
            {
                "type": int,
                "default": 30,
                "help": "Cantidad de grupos a imprimir en consola (default 30). Usa 0 para imprimir todos.",
            },
        ),
        (
            "--xlsx",
            {
                "dest": "xlsx_path",
                "nargs": "?",
                "const": "",
                "help": "Exporta Excel (.xlsx). Sin argumento guarda en exports/ con timestamp; con ruta absoluta usa esa ruta.",
            },
        ),
        (
            "--json",
            {
                "dest": "json_path",
                "nargs": "?",
                "const": "",
                "help": "Exporta JSON. Sin argumento guarda en exports/ con timestamp; con ruta absoluta usa esa ruta.",
            },
        ),
    )
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
def main():
    """Run the cross-branch duplicate audit end to end.

    Parses the options, loads the contacts, builds the duplicate groups,
    optionally cross-checks them against the brand account, prints the
    detail of the top groups, writes the requested exports and finishes
    with the overview.

    Raises:
        SystemExit: If ``--match`` is empty or names an unknown criterion.
    """
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    args = parse_args()
    cleaned = (token.strip().lower() for token in args.match.split(","))
    match_modes = {token for token in cleaned if token}
    invalid = match_modes.difference(("phone", "email"))
    if invalid or not match_modes:
        raise SystemExit(f"Criterios de --match invalidos: {invalid or 'vacio'}")

    accounts, contacts = load_contacts()
    branch_contacts = []
    brand_contacts = []
    for contact in contacts:
        if contact["location_id"] == BRAND_LOCATION_ID:
            brand_contacts.append(contact)
        else:
            branch_contacts.append(contact)
    branch_locations = {c["location_id"] for c in branch_contacts}

    banner = "=" * 78
    safe_print(banner)
    safe_print("AUDIT: CONTACTOS DUPLICADOS ENTRE SUCURSALES")
    safe_print(banner)
    safe_print(f"Sucursales con contactos: {len(branch_locations)}")
    safe_print(f"Contactos en sucursales: {len(branch_contacts)}")
    safe_print(f"Contactos en Marca: {len(brand_contacts)}")
    safe_print(f"Criterios de match: {', '.join(sorted(match_modes))}")

    sin_phone = sum(not normalize_phone(c["phone"]) for c in branch_contacts)
    sin_email = sum(not normalize_email(c["email"]) for c in branch_contacts)
    safe_print(f"Sin telefono normalizable: {sin_phone}")
    safe_print(f"Sin email normalizable: {sin_email}")

    groups, phone_collisions = build_groups(branch_contacts, match_modes)
    safe_print(f"\nGrupos duplicados detectados (>=2 sucursales): {len(groups)}")
    if "phone" in match_modes:
        safe_print(
            f"Colisiones de telefono sin match (revisión manual): {len(phone_collisions)}"
        )

    if args.no_brand_check:
        # None marks "not checked", as opposed to an empty list ("no match").
        brand_results = [None for _ in groups]
    else:
        brand_by_phone, brand_by_email = index_brand(brand_contacts)
        brand_results = [
            brand_check(group, brand_by_phone, brand_by_email) for group in groups
        ]
        # Bucket counts for the summary.
        sizes = [len(matches) for matches in brand_results]
        sin_marca = sizes.count(0)
        una_en_marca = sizes.count(1)
        multi_marca = len(sizes) - sin_marca - una_en_marca
        safe_print(f" - sin contraparte en Marca: {sin_marca}")
        safe_print(f" - con 1 contacto en Marca: {una_en_marca}")
        safe_print(f" - con multiples copias en Marca: {multi_marca}")

    if args.top <= 0:
        limit = len(groups)
    else:
        limit = min(args.top, len(groups))
    if limit:
        divider = "-" * 78
        safe_print("\n" + divider)
        safe_print(f"DETALLE (top {limit} grupos por # sucursales involucradas)")
        safe_print(divider)
        shown = zip(groups[:limit], brand_results[:limit])
        for number, (group, matches) in enumerate(shown, start=1):
            print_group(number, group, matches, accounts)

    export_jobs = (
        (args.xlsx_path, ".xlsx", export_xlsx, "\nExcel exportado en: "),
        (args.json_path, ".json", export_json, "JSON exportado en: "),
    )
    for requested, extension, exporter, announcement in export_jobs:
        if requested is None:
            continue
        destination, in_exports = resolve_export_path(
            requested, "duplicados_sucursales", extension
        )
        exporter(destination, groups, brand_results, accounts, phone_collisions)
        safe_print(f"{announcement}{destination}")
        if in_exports:
            safe_print(f"[DOWNLOAD] /api/exports/{os.path.basename(destination)}")

    print_overview(
        groups, brand_results, accounts,
        branch_contacts, brand_contacts, match_modes, args.no_brand_check,
        phone_collisions,
    )
# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()