Primer commit

2026-05-30 14:31:19 -06:00
commit a35d26fac0
277 changed files with 265240 additions and 0 deletions
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+contact_classifier.py
+
+Modulo compartido que detecta contactos de test/prueba/E3 a partir de
+first_name, last_name, email y tags. Es la fuente de verdad de los
+patrones; tanto el script `scripts/find_test_contacts.py` como el endpoint
+`/api/contacts/{location_id}` lo usan para mantener consistencia entre el
+reporte CLI y la marca visual del dashboard.
+
+Funciones publicas:
+  - classify_contact(first_name, last_name, email, tags_list) -> (reasons, classes)
+  - is_test_contact(contact_dict) -> bool
+  - annotate_contact(contact_dict) -> dict   (agrega is_test y test_reasons)
+"""
+
+import json
+import re
+import unicodedata
+
+
+# Patrones (mismos del script find_test_contacts.py)
+SPANISH_PATTERNS = [
+    r'prueba', r'pruebas', r'probando', r'testeo', r'ejemplo', r'ejemplos',
+    r'demostracion', r'borrador', r'probar', r'pruebita'
+]
+
+ENGLISH_PATTERNS = [
+    r'test', r'testing', r'tester', r'tests', r'dummy', r'fake', r'mock',
+    r'example', r'examples', r'demo', r'trial', r'sandbox'
+]
+
+E3_PATTERNS = [
+    r'e3', r'e-3'
+]
+
+GENERIC_PATTERNS = [
+    r'temporal', r'temp', r'generico', r'sdasd', r'asdasd', r'qwerty', r'12345',
+    r'testea', r'ficticio'
+]
+
+# Correos conocidos del equipo E3 que siempre se consideran test, aunque el
+# nombre/etiquetas no disparen ningun patron.
+TEST_EMAILS = {
+    "servandobra@gmail.com",
+    "uriel.conse3@gmail.com",
+}
+
+
+def normalize_text(text):
+    if not text:
+        return ""
+    nfkd = unicodedata.normalize("NFD", str(text))
+    clean = "".join(c for c in nfkd if unicodedata.category(c) != "Mn")
+    return " ".join(clean.lower().split())
+
+
+def _word_match(pattern, text):
+    """
+    True si `pattern` aparece como palabra completa en `text` (limites \\b).
+
+    Evita falsos positivos por subcadena: p.ej. 'temp' NO debe matchear dentro
+    de 'tlatempa', ni 'test' dentro de 'protesta'. Una palabra solo cuenta si
+    esta delimitada por inicio/fin de cadena, espacios o puntuacion (incluido
+    el '-' de tags como 'qa-test' o el '@' de un email 'test@...').
+    """
+    if not text:
+        return False
+    return re.search(rf'\b{re.escape(pattern)}\b', text, re.IGNORECASE) is not None
+
+
+def classify_contact(first_name, last_name, email, tags_list):
+    """
+    Devuelve (reasons, classifications). Si reasons es vacio, el contacto NO es de prueba.
+    """
+    first_name_lower = (first_name or "").lower()
+    last_name_lower = (last_name or "").lower()
+    email_lower = (email or "").lower()
+    tags_lower = [t.lower() for t in tags_list] if tags_list else []
+
+    reasons = []
+    classifications = set()
+
+    # 0. Correos conocidos del equipo E3 (match exacto).
+    if email_lower in TEST_EMAILS:
+        reasons.append(f"Email en lista E3 ({email_lower})")
+        classifications.add("E3")
+
+    # 1. E3 (regex con limites para evitar falsos positivos)
+    for p in E3_PATTERNS:
+        pattern_regex = re.compile(rf'\b{p}\b|{p}[-_]|[-_]{p}', re.IGNORECASE)
+        if pattern_regex.search(first_name_lower) or pattern_regex.search(last_name_lower):
+            reasons.append(f"Nombre contiene '{p.upper()}'")
+            classifications.add("E3")
+        elif p in first_name_lower or p in last_name_lower:
+            reasons.append(f"Nombre contiene subcadena '{p.upper()}'")
+            classifications.add("E3")
+
+        email_pattern = re.compile(rf'\b{p}\b|^{p}|{p}@|{p}[-_]|[-_]{p}', re.IGNORECASE)
+        if email_pattern.search(email_lower):
+            reasons.append(f"Email contiene '{p.upper()}'")
+            classifications.add("E3")
+
+        if any(p in t for t in tags_lower):
+            reasons.append(f"Etiqueta contiene '{p.upper()}'")
+            classifications.add("E3")
+
+    # 2. Espanol (palabra completa para evitar falsos positivos por subcadena)
+    for p in SPANISH_PATTERNS:
+        if _word_match(p, first_name_lower) or _word_match(p, last_name_lower):
+            reasons.append(f"Nombre contiene '{p}'")
+            classifications.add("Test/Prueba Espanol")
+        if _word_match(p, email_lower):
+            reasons.append(f"Email contiene '{p}'")
+            classifications.add("Test/Prueba Espanol")
+        if any(_word_match(p, t) for t in tags_lower):
+            reasons.append(f"Etiqueta contiene '{p}'")
+            classifications.add("Test/Prueba Espanol")
+
+    # 3. Ingles (palabra completa para evitar falsos positivos por subcadena)
+    for p in ENGLISH_PATTERNS:
+        if _word_match(p, first_name_lower) or _word_match(p, last_name_lower):
+            reasons.append(f"Nombre contiene '{p}'")
+            classifications.add("Test/Prueba Ingles")
+        if _word_match(p, email_lower):
+            reasons.append(f"Email contiene '{p}'")
+            classifications.add("Test/Prueba Ingles")
+        if any(_word_match(p, t) for t in tags_lower):
+            reasons.append(f"Etiqueta contiene '{p}'")
+            classifications.add("Test/Prueba Ingles")
+
+    # 4. Generico (palabra completa: 'temp' no debe matchear dentro de 'tlatempa')
+    for p in GENERIC_PATTERNS:
+        if _word_match(p, first_name_lower) or _word_match(p, last_name_lower):
+            reasons.append(f"Nombre contiene temporal '{p}'")
+            classifications.add("Temporal/Generico")
+        if _word_match(p, email_lower):
+            reasons.append(f"Email contiene temporal '{p}'")
+            classifications.add("Temporal/Generico")
+        if any(_word_match(p, t) for t in tags_lower):
+            reasons.append(f"Etiqueta contiene temporal '{p}'")
+            classifications.add("Temporal/Generico")
+
+    # 5. Nombre o apellido puramente numerico
+    if (first_name_lower and first_name_lower.isdigit()) or (last_name_lower and last_name_lower.isdigit()):
+        reasons.append("Nombre o Apellido es numerico")
+        classifications.add("Temporal/Generico")
+
+    return sorted(list(set(reasons))), sorted(list(classifications))
+
+
+def _coerce_tags(raw):
+    """Acepta tags como lista, JSON string o None y devuelve lista de strings."""
+    if raw is None:
+        return []
+    if isinstance(raw, list):
+        return [str(t) for t in raw if t is not None]
+    if isinstance(raw, str):
+        try:
+            parsed = json.loads(raw)
+            if isinstance(parsed, list):
+                return [str(t) for t in parsed if t is not None]
+        except (ValueError, TypeError):
+            pass
+    return []
+
+
+def annotate_contact(contact):
+    """
+    Recibe un dict de contacto (como lo devuelve db.get_contacts) y le agrega
+    los campos `is_test` (bool) y `test_reasons` (lista de strings).
+    Devuelve el mismo dict mutado.
+    """
+    tags = _coerce_tags(contact.get("tags"))
+    reasons, _classes = classify_contact(
+        contact.get("first_name"),
+        contact.get("last_name"),
+        contact.get("email"),
+        tags,
+    )
+    contact["is_test"] = bool(reasons)
+    contact["test_reasons"] = reasons
+    return contact
+
+
+def is_test_contact(contact):
+    """Atajo: True/False sin mutar el dict."""
+    tags = _coerce_tags(contact.get("tags"))
+    reasons, _ = classify_contact(
+        contact.get("first_name"),
+        contact.get("last_name"),
+        contact.get("email"),
+        tags,
+    )
+    return bool(reasons)