# html/api/scan-erp-gaps-rss.py

#!/usr/bin/env python3
"""
WEVAL — ERP Gap Scanner via RSS/Atom feeds (Option C)
Subscribes to consulting/tech/vendor feeds and extracts mentions of the 25 ERPs that co-occur with pain keywords.
Stores them in erp_gap_scans with source_url = URL of the article.

Sources publiques (no auth required):
- Reddit r/ERP and vendor subreddits (RSS feeds)
- CIO.com ERP tag
- TechRepublic ERP
- SAP/Oracle/Microsoft blog RSS (release notes mentioning issues)
- Gartner public blog
- G2 recently added (limited)

Doctrine #5: INSERT ... ON CONFLICT DO NOTHING (re-runs never duplicate rows).
Doctrine #4: honesty — tag the source clearly.
"""
import html
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime

import psycopg2

try:
    import feedparser
except ImportError:
    print("ERR: pip install feedparser", file=sys.stderr)
    sys.exit(1)

# Connection settings passed verbatim to psycopg2.connect(). Each value can be
# overridden with an ERP_SCAN_DB_* environment variable so credentials do not
# have to be edited in (or read from) the source file.
# NOTE(review): the literal fallbacks, including the password, are kept only
# for backward compatibility — rotate them and remove the defaults once the
# environment variables are provisioned.
DB_CONFIG = dict(
    host=os.environ.get("ERP_SCAN_DB_HOST", "10.1.0.3"),
    port=int(os.environ.get("ERP_SCAN_DB_PORT", "5432")),
    dbname=os.environ.get("ERP_SCAN_DB_NAME", "adx_system"),
    user=os.environ.get("ERP_SCAN_DB_USER", "admin"),
    password=os.environ.get("ERP_SCAN_DB_PASSWORD", "admin123"),
    connect_timeout=5,
)

# Public RSS/Atom endpoints that need no authentication, as (label, url) pairs.
# The label ends up in the stored row: as "rss_<label>" in the query column
# and as an extra entry in the keywords list.
# NOTE(review): "CIO_ERP" points at CIO's site-wide feed and
# "ComputerWeekly_ERP" at a transport/travel-industry feed — confirm these are
# the intended ERP-specific sources.
RSS_FEEDS = [
    ("Reddit_r_ERP", "https://www.reddit.com/r/ERP/.rss"),
    ("Reddit_r_sap", "https://www.reddit.com/r/SAP/.rss"),
    ("Reddit_r_netsuite", "https://www.reddit.com/r/netsuite/.rss"),
    ("Reddit_r_Dynamics365", "https://www.reddit.com/r/Dynamics365/.rss"),
    ("Reddit_r_salesforce", "https://www.reddit.com/r/salesforce/.rss"),
    ("Reddit_r_workday", "https://www.reddit.com/r/Workday/.rss"),
    ("Reddit_r_Odoo", "https://www.reddit.com/r/Odoo/.rss"),
    ("CIO_ERP", "https://www.cio.com/feed/"),
    (
        "TechRepublic_Enterprise",
        "https://www.techrepublic.com/rssfeeds/topic/enterprise-software/",
    ),
    (
        "ComputerWeekly_ERP",
        "https://www.computerweekly.com/rss/IT-for-transport-and-travel-industry.xml",
    ),
    ("ITWorldCanada_ERP", "https://www.itworldcanada.com/feed"),
    ("ERPToday", "https://erp.today/feed/"),
    ("DiginomicaERP", "https://diginomica.com/topic/erp/rss.xml"),
    ("CXToday", "https://www.cxtoday.com/crm/feed/"),
]

# Alias -> erp_id lookup used to spot ERP mentions in feed text.
# Several aliases may share one erp_id. Insertion order is significant: the
# first alias longer than 3 characters for an id is used as its display name.
ERP_MAP = {
    # SAP
    "SAP S/4HANA": "sap_s4hana",
    "S/4HANA": "sap_s4hana",
    "S/4 HANA": "sap_s4hana",
    "SAP Business One": "sap_b1",
    "SAP B1": "sap_b1",
    "Business One": "sap_b1",
    # Oracle
    "Oracle E-Business": "oracle_ebs",
    "Oracle EBS": "oracle_ebs",
    "E-Business Suite": "oracle_ebs",
    "Oracle Fusion": "oracle_fusion",
    "Fusion Cloud": "oracle_fusion",
    "NetSuite": "oracle_netsuite",
    # Sage
    "Sage X3": "sage_x3",
    "Sage 100": "sage_100",
    "Sage Intacct": "sage_intacct",
    "Intacct": "sage_intacct",
    # Odoo
    "Odoo": "odoo",
    # Microsoft Dynamics 365
    "Dynamics 365 F&O": "ms_d365_fo",
    "D365 F&O": "ms_d365_fo",
    "Dynamics 365 Finance": "ms_d365_fo",
    "D365FO": "ms_d365_fo",
    "Dynamics 365 Business Central": "ms_d365_bc",
    "D365 BC": "ms_d365_bc",
    "Business Central": "ms_d365_bc",
    "Dynamics 365 Customer Engagement": "ms_d365_ce",
    "D365 CE": "ms_d365_ce",
    "Dynamics CRM": "ms_d365_ce",
    # Other vendors
    "Workday": "workday",
    "Salesforce": "salesforce",
    "Infor M3": "infor_m3",
    "Infor CloudSuite": "infor_cs",
    "Infor CS": "infor_cs",
    "IFS Cloud": "ifs",
    "IFS Applications": "ifs",
    "Epicor": "epicor",
    "Kinetic": "epicor",
    "QAD": "qad",
    "Acumatica": "acumatica",
    "Priority": "priority",
    "Deltek": "deltek",
    "Costpoint": "deltek",
    "ServiceNow": "servicenow",
    "Veeva": "veeva",
    "Temenos": "temenos",
}

# Pain-signal keywords (English + French), all lowercase because they are
# matched against lowercased text. Every keyword is listed exactly once: a
# repeated entry would be counted once per occurrence in this list and
# inflate the pain score.
PAIN_KW = [
    # EN
    "pain", "limitation", "limits", "issue", "problem", "bug", "slow", "crash",
    "complaint", "drawback", "weakness", "shortcoming", "bottleneck", "broken",
    "frustrating", "workaround", "manual", "difficult", "lacks", "missing",
    "challenge", "struggle", "outdated", "legacy", "expensive", "complex",
    # FR ("limitation" and "bug" are spelled as in English and already listed)
    "lent", "manque", "problème", "difficulté", "bogue",
    "archaïque", "obsolète", "manuel", "complexe", "difficile", "frustrant",
]

def normalize_entry(entry):
    """Extract a plain-text title, summary and link from one feed entry.

    Args:
        entry: Mapping-like feed entry (anything exposing ``.get``), such as
            an item of ``feedparser.parse(...).entries``.

    Returns:
        tuple: ``(title, summary, link)``. HTML entities are decoded in the
        title and the summary; the summary additionally has its tags removed
        and its whitespace collapsed. Missing, empty or ``None`` fields are
        returned as ``""``.
    """
    # ``or ""`` covers keys that are present but None (html.unescape would
    # raise TypeError on None) and lets an empty summary fall back to the
    # description field instead of hiding it.
    title = html.unescape(entry.get("title") or "")
    raw_summary = entry.get("summary") or entry.get("description") or ""
    summary = html.unescape(raw_summary)
    # Replace each tag with a space so neighbouring words are not glued
    # together, then collapse the resulting runs of whitespace.
    summary = re.sub(r"<[^>]+>", " ", summary)
    summary = re.sub(r"\s+", " ", summary).strip()
    link = entry.get("link") or ""
    return title, summary, link

def detect_erps_mentioned(text, erp_map=None):
    """Return the ERP ids whose aliases occur in *text* as whole terms.

    Matching is case-insensitive and requires that the alias is not embedded
    in a longer word, so "squad" does not count as a mention of "QAD".

    Args:
        text: Free text to scan; ``None`` or ``""`` yields an empty list.
        erp_map: Optional ``{alias: erp_id}`` mapping. Defaults to the
            module-level ``ERP_MAP``.

    Returns:
        list: Sorted, de-duplicated erp ids found in *text*.
    """
    if erp_map is None:
        erp_map = ERP_MAP
    text_lc = (text or "").lower()
    if not text_lc:
        return []

    found = set()
    for alias, erp_id in erp_map.items():
        if erp_id in found:
            # Another alias of the same ERP already matched.
            continue
        # Lookarounds instead of \b: aliases may start or end with a
        # non-word character (e.g. "S/4HANA", "D365 F&O"), where \b would
        # not behave as a term boundary.
        pattern = r"(?<!\w)" + re.escape(alias.lower()) + r"(?!\w)"
        if re.search(pattern, text_lc):
            found.add(erp_id)
    # Sorted so the result does not depend on set iteration order.
    return sorted(found)

def score_pain(text, keywords=None):
    """Score how strongly *text* signals a pain point.

    A keyword matches when it occurs at the start of a word, compared
    case-insensitively. Inflected forms still match ("issues", "bugs"), but
    a keyword buried inside another word does not ("lent" in "excellent",
    "bug" in "debug"). Each distinct keyword is counted at most once.

    Args:
        text: Free text to scan; ``None`` or ``""`` scores 0.
        keywords: Optional iterable of keywords. Defaults to the module-level
            ``PAIN_KW``.

    Returns:
        tuple: ``(score, matches)`` where ``score`` is
        ``min(1.0, len(matches) / 5)`` rounded to 3 decimals and ``matches``
        lists the matched keywords in their original order.
    """
    if keywords is None:
        keywords = PAIN_KW
    text_lc = (text or "").lower()

    matches = []
    # dict.fromkeys drops repeated keywords while keeping their order, so a
    # keyword listed twice cannot be counted twice.
    for kw in dict.fromkeys(keywords):
        if re.search(r"(?<!\w)" + re.escape(kw.lower()), text_lc):
            matches.append(kw)

    # Normalize: 5 or more distinct keywords saturate the score at 1.0.
    score = min(1.0, len(matches) / 5.0)
    return round(score, 3), matches

def _fetch_entries(feed_url, limit=30):
    """Return the first *limit* entries of a feed, or None if it is unreadable."""
    try:
        feed = feedparser.parse(feed_url)
        entries = feed.entries[:limit]
    except Exception as e:  # best effort: one broken feed must not stop the run
        print(f"  [ERR] {e}")
        return None
    # feedparser usually reports fetch/parse failures through the "bozo" flag
    # instead of raising. Only treat it as fatal when nothing was parsed: a
    # merely ill-formed feed can still yield usable entries.
    if not entries and feed.get("bozo"):
        print(f"  [ERR] {feed.get('bozo_exception')}")
        return None
    return entries


def _insert_gap_row(cur, row):
    """Insert one erp_gap_scans row inside a savepoint; True if a row was added."""
    # A failed statement aborts the whole PostgreSQL transaction. The
    # savepoint confines the failure to this row so the other rows of the
    # feed are still committed.
    cur.execute("SAVEPOINT erp_gap_row")
    try:
        cur.execute("""
            INSERT INTO erp_gap_scans (erp_id, erp_name, query, source_url, title, snippet, confidence_score, keywords)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (erp_id, source_url) DO NOTHING
        """, row)
    except psycopg2.Error as e:
        cur.execute("ROLLBACK TO SAVEPOINT erp_gap_row")
        print(f"  [ERR] insert {row[0]} {row[3]}: {e}")
        return False
    # Read rowcount before RELEASE overwrites it; 0 means ON CONFLICT skipped.
    added = cur.rowcount > 0
    cur.execute("RELEASE SAVEPOINT erp_gap_row")
    return added


def main():
    """Scan every feed in RSS_FEEDS and store ERP pain mentions in the database.

    For each feed, the first 30 entries are normalized, matched against the
    ERP aliases and scored for pain keywords. Every (ERP, article) pair with
    at least one pain keyword is inserted into ``erp_gap_scans``; pairs that
    already exist for the same ``(erp_id, source_url)`` are skipped by the
    ``ON CONFLICT`` clause. Work is committed once per feed. Unreadable feeds
    and rejected rows are reported and skipped.

    Returns:
        int: Always 0, used as the process exit status.

    Raises:
        psycopg2.Error: If the connection cannot be opened or is lost.
    """
    import socket

    print(f"═══ SCAN-ERP-GAPS-RSS · {datetime.now().isoformat()} ═══")

    # feedparser.parse() takes no timeout argument; the process-wide socket
    # default is what bounds a hanging feed. Set it once, not per feed.
    socket.setdefaulttimeout(10)

    # Display name per erp_id: its first alias longer than 3 characters
    # (ids without such an alias fall back to the id itself).
    display_names = {}
    for alias, erp_id in ERP_MAP.items():
        if len(alias) > 3:
            display_names.setdefault(erp_id, alias)

    total_feeds = 0
    total_entries = 0
    total_matches = 0
    total_inserted = 0

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            for feed_name, feed_url in RSS_FEEDS:
                print(f"\n━━━ {feed_name} ━━━")
                entries = _fetch_entries(feed_url)
                if entries is None:
                    continue
                total_feeds += 1
                total_entries += len(entries)
                print(f"  → {len(entries)} entries")

                feed_matches = 0
                feed_inserted = 0

                for entry in entries:
                    title, summary, link = normalize_entry(entry)
                    combined = f"{title} {summary}"
                    if not combined.strip() or not link:
                        continue

                    erps = detect_erps_mentioned(combined)
                    if not erps:
                        continue

                    score, kws = score_pain(combined)
                    if score < 0.1:  # at least 1 pain keyword
                        continue

                    feed_matches += 1
                    total_matches += 1

                    # One row per ERP mentioned in the article.
                    for erp_id in erps:
                        row = (
                            erp_id,
                            display_names.get(erp_id, erp_id),
                            f"rss_{feed_name}",
                            link[:500], title[:500], summary[:1500],
                            score,
                            kws + ["rss", feed_name],
                        )
                        if _insert_gap_row(cur, row):
                            feed_inserted += 1
                            total_inserted += 1

                conn.commit()
                print(f"  matches={feed_matches}, inserted={feed_inserted}")
    finally:
        # Always release the connection, even when a feed or the DB fails.
        conn.close()

    print(f"\n═══ DONE · feeds={total_feeds} · entries={total_entries} · matches={total_matches} · inserted={total_inserted} ═══")
    return 0

# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())