197 lines
7.1 KiB
Python
197 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
contact_classifier.py
|
|
|
|
Modulo compartido que detecta contactos de test/prueba/E3 a partir de
|
|
first_name, last_name, email y tags. Es la fuente de verdad de los
|
|
patrones; tanto el script `scripts/find_test_contacts.py` como el endpoint
|
|
`/api/contacts/{location_id}` lo usan para mantener consistencia entre el
|
|
reporte CLI y la marca visual del dashboard.
|
|
|
|
Funciones publicas:
|
|
- classify_contact(first_name, last_name, email, tags_list) -> (reasons, classes)
|
|
- is_test_contact(contact_dict) -> bool
|
|
- annotate_contact(contact_dict) -> dict (agrega is_test y test_reasons)
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import unicodedata
|
|
|
|
|
|
# Detection patterns (the same ones used by the find_test_contacts.py script).
# Every entry is lower-case and unaccented.
SPANISH_PATTERNS = [
    r"prueba",
    r"pruebas",
    r"probando",
    r"testeo",
    r"ejemplo",
    r"ejemplos",
    r"demostracion",
    r"borrador",
    r"probar",
    r"pruebita",
]

ENGLISH_PATTERNS = [
    r"test",
    r"testing",
    r"tester",
    r"tests",
    r"dummy",
    r"fake",
    r"mock",
    r"example",
    r"examples",
    r"demo",
    r"trial",
    r"sandbox",
]

E3_PATTERNS = [
    r"e3",
    r"e-3",
]

GENERIC_PATTERNS = [
    r"temporal",
    r"temp",
    r"generico",
    r"sdasd",
    r"asdasd",
    r"qwerty",
    r"12345",
    r"testea",
    r"ficticio",
]

# Known e-mail addresses of the E3 team. They are always treated as test
# contacts, even when the name/tags do not trigger any pattern.
TEST_EMAILS = {
    "servandobra@gmail.com",
    "uriel.conse3@gmail.com",
}
|
|
|
|
|
|
def normalize_text(text):
    """Return *text* lower-cased, accent-free and with whitespace collapsed.

    The value is converted with ``str()``, decomposed with Unicode NFD, and
    every non-spacing mark (category ``Mn``) is dropped, so accented letters
    become their base letter. Runs of whitespace are reduced to one space and
    leading/trailing whitespace is removed.

    Any falsy input (``None``, ``""``, ``0`` ...) yields ``""``.
    """
    if text:
        decomposed = unicodedata.normalize("NFD", str(text))
        # Keep everything except the combining marks left behind by NFD.
        base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
        words = "".join(base_chars).lower().split()
        return " ".join(words)
    return ""
|
|
|
|
|
|
def _word_match(pattern, text):
    """Return True if ``pattern`` occurs in ``text`` as a whole token.

    A token counts only when it is delimited on both sides by the start/end
    of the string or by a non-alphanumeric character: whitespace, ``-``,
    ``@``, ``.`` and also ``_``. This avoids substring false positives:
    ``temp`` does not match inside ``tlatempa`` and ``test`` does not match
    inside ``protesta`` or ``test123``, while ``qa-test``, ``qa_test`` and
    ``test@...`` do match.

    Args:
        pattern: Literal text to look for (it is regex-escaped here).
        text: String to search; a falsy value never matches.

    Returns:
        bool: whether a delimited, case-insensitive occurrence was found.
    """
    if not text:
        return False
    # A plain \b would treat '_' as part of the word, so 'test_user' would
    # never match 'test'. `[^\W_]` means "alphanumeric"; requiring that no
    # alphanumeric character touches the pattern makes '_' a delimiter too.
    delimited = rf'(?<![^\W_]){re.escape(pattern)}(?![^\W_])'
    return re.search(delimited, text, re.IGNORECASE) is not None
|
|
|
|
|
|
def _collect_word_reasons(patterns, names, email, tags, qualifier=""):
    """Return the reason strings for whole-word hits of *patterns*.

    Each pattern is checked with ``_word_match`` against the name parts, the
    e-mail and every tag; one reason is produced per field kind that matches.
    ``qualifier`` is inserted verbatim before the quoted pattern in the
    message (e.g. ``"temporal "``).
    """
    found = []
    for p in patterns:
        if any(_word_match(p, name) for name in names):
            found.append(f"Nombre contiene {qualifier}'{p}'")
        if _word_match(p, email):
            found.append(f"Email contiene {qualifier}'{p}'")
        if any(_word_match(p, tag) for tag in tags):
            found.append(f"Etiqueta contiene {qualifier}'{p}'")
    return found


def classify_contact(first_name, last_name, email, tags_list):
    """Classify a contact as test/E3/generic from its name, e-mail and tags.

    All inputs are passed through ``normalize_text`` first (lower-case,
    accents stripped, whitespace collapsed). The pattern lists are written
    unaccented (``demostracion``, ``generico``), so without accent folding
    values such as "Demostración" would never match. Normalizing also makes
    ``None`` and non-string values safe to pass.

    Args:
        first_name: First name, or None.
        last_name: Last name, or None.
        email: E-mail address, or None.
        tags_list: Iterable of tags, or None/empty.

    Returns:
        tuple: ``(reasons, classifications)``, both sorted lists of strings.
        ``reasons`` is de-duplicated; an empty ``reasons`` means the contact
        is NOT a test contact.
    """
    first = normalize_text(first_name)
    last = normalize_text(last_name)
    mail = normalize_text(email)
    tags = [normalize_text(t) for t in (tags_list or [])]
    names = (first, last)

    reasons = []
    classifications = set()

    # 0. Known E3 team addresses (exact match).
    if mail in TEST_EMAILS:
        reasons.append(f"Email en lista E3 ({mail})")
        classifications.add("E3")

    # 1. E3 markers. Names get a delimited regex first and then fall back to
    #    a plain substring check (reported with a different message); tags
    #    are matched by substring only.
    for p in E3_PATTERNS:
        token = re.escape(p)
        name_re = re.compile(rf'\b{token}\b|{token}[-_]|[-_]{token}', re.IGNORECASE)
        if name_re.search(first) or name_re.search(last):
            reasons.append(f"Nombre contiene '{p.upper()}'")
            classifications.add("E3")
        elif p in first or p in last:
            reasons.append(f"Nombre contiene subcadena '{p.upper()}'")
            classifications.add("E3")

        email_re = re.compile(
            rf'\b{token}\b|^{token}|{token}@|{token}[-_]|[-_]{token}',
            re.IGNORECASE,
        )
        if email_re.search(mail):
            reasons.append(f"Email contiene '{p.upper()}'")
            classifications.add("E3")

        if any(p in t for t in tags):
            reasons.append(f"Etiqueta contiene '{p.upper()}'")
            classifications.add("E3")

    # 2-4. Spanish, English and generic/temporary words (whole-word matches
    #      only, to avoid substring false positives such as 'temp' inside
    #      'tlatempa').
    word_groups = (
        (SPANISH_PATTERNS, "Test/Prueba Espanol", ""),
        (ENGLISH_PATTERNS, "Test/Prueba Ingles", ""),
        (GENERIC_PATTERNS, "Temporal/Generico", "temporal "),
    )
    for patterns, label, qualifier in word_groups:
        hits = _collect_word_reasons(patterns, names, mail, tags, qualifier)
        if hits:
            reasons.extend(hits)
            classifications.add(label)

    # 5. First or last name made only of digits.
    if (first and first.isdigit()) or (last and last.isdigit()):
        reasons.append("Nombre o Apellido es numerico")
        classifications.add("Temporal/Generico")

    return sorted(set(reasons)), sorted(classifications)
|
|
|
|
|
|
def _coerce_tags(raw):
    """Turn a raw ``tags`` value into a list of strings.

    Accepts a list, a string holding a JSON array, or None. List items that
    are None are dropped and the rest are converted with ``str()``. Anything
    else (invalid JSON, JSON that is not an array, other types) gives ``[]``.
    """
    candidate = raw
    if isinstance(candidate, str):
        # Strings are expected to hold a JSON-encoded array.
        try:
            candidate = json.loads(candidate)
        except (ValueError, TypeError):
            return []
    if not isinstance(candidate, list):
        return []
    return [str(item) for item in candidate if item is not None]
|
|
|
|
|
|
def annotate_contact(contact):
    """Add the test-detection fields to a contact dict, in place.

    Reads ``tags``, ``first_name``, ``last_name`` and ``email`` from
    ``contact`` (presumably a row as returned by ``db.get_contacts`` --
    confirm against the caller) and stores:

    - ``is_test``: True when at least one reason was found.
    - ``test_reasons``: the list of reasons from ``classify_contact``.

    Returns:
        The same dict object that was passed in, mutated.
    """
    tag_list = _coerce_tags(contact.get("tags"))
    name_and_mail = [contact.get(key) for key in ("first_name", "last_name", "email")]
    found, _unused_classes = classify_contact(*name_and_mail, tag_list)
    contact["is_test"] = bool(found)
    contact["test_reasons"] = found
    return contact
|
|
|
|
|
|
def is_test_contact(contact):
    """Return True when the contact is classified as test; no mutation.

    Reads ``tags``, ``first_name``, ``last_name`` and ``email`` from
    ``contact`` and reports whether ``classify_contact`` produced any reason.
    """
    tag_list = _coerce_tags(contact.get("tags"))
    first = contact.get("first_name")
    last = contact.get("last_name")
    mail = contact.get("email")
    found = classify_contact(first, last, mail, tag_list)[0]
    if found:
        return True
    return False
|