Files
html/api/scan-erp-gaps-playwright.js

207 lines
6.4 KiB
JavaScript

// WEVAL — ERP Gap Scanner via Playwright (Option B)
// Scrapes public review pages (G2/TrustRadius/Capterra-style; only TrustRadius URLs are currently configured in ERP_URLS) for "Cons" / "Dislikes"-like complaint text
// Usage: node scan-erp-gaps-playwright.js [erp_id|all]
// Doctrine #5 ON CONFLICT DO NOTHING via unique constraint
const { createHash } = require('crypto');
const { chromium } = require('playwright');
const { Pool } = require('pg');
// PostgreSQL connection settings handed to `new Pool(...)`.
// Each value can be overridden with the standard libpq-style environment
// variable so credentials do not have to live in the source file; the literals
// are the previous hard-coded values and remain the defaults.
const DB = {
  host: process.env.PGHOST || '10.1.0.3',
  // parseInt yields NaN for a missing or malformed PGPORT, which falls back to 5432.
  port: Number.parseInt(process.env.PGPORT, 10) || 5432,
  database: process.env.PGDATABASE || 'adx_system',
  user: process.env.PGUSER || 'admin',
  password: process.env.PGPASSWORD || 'admin123',
  connectionTimeoutMillis: 5000,
};
// Public review pages to scrape, keyed by ERP id. Each id maps to a list so
// more pages can be added per ERP; at present every entry is a single
// TrustRadius reviews URL.
// NOTE(review): the infor_m3 slug points at "infor-cloudsuite-industrial" while
// ERP_NAMES labels that id "Infor M3" — confirm this is the intended product page.
const ERP_URLS = {
  sap_s4hana: ['https://www.trustradius.com/products/sap-s-4hana/reviews'],
  sap_b1: ['https://www.trustradius.com/products/sap-business-one/reviews'],
  oracle_netsuite: ['https://www.trustradius.com/products/netsuite-erp/reviews'],
  oracle_fusion: ['https://www.trustradius.com/products/oracle-fusion-cloud-erp/reviews'],
  sage_x3: ['https://www.trustradius.com/products/sage-x3/reviews'],
  sage_intacct: ['https://www.trustradius.com/products/sage-intacct/reviews'],
  odoo: ['https://www.trustradius.com/products/odoo/reviews'],
  ms_d365_fo: ['https://www.trustradius.com/products/microsoft-dynamics-365-finance/reviews'],
  ms_d365_bc: ['https://www.trustradius.com/products/microsoft-dynamics-365-business-central/reviews'],
  workday: ['https://www.trustradius.com/products/workday-human-capital-management/reviews'],
  salesforce: ['https://www.trustradius.com/products/salesforce-sales-cloud/reviews'],
  infor_m3: ['https://www.trustradius.com/products/infor-cloudsuite-industrial/reviews'],
  ifs: ['https://www.trustradius.com/products/ifs-cloud/reviews'],
  epicor: ['https://www.trustradius.com/products/epicor-kinetic/reviews'],
  acumatica: ['https://www.trustradius.com/products/acumatica-cloud-erp/reviews'],
  deltek: ['https://www.trustradius.com/products/deltek-costpoint/reviews'],
  servicenow: ['https://www.trustradius.com/products/servicenow/reviews'],
  veeva: ['https://www.trustradius.com/products/veeva-vault/reviews'],
};
// Human-readable product label for each ERP id; stored as `erp_name` next to
// the scraped snippets. Ids without an entry fall back to the raw id at the
// point of use.
const ERP_NAMES = {
  // SAP
  sap_s4hana:      'SAP S/4HANA',
  sap_b1:          'SAP Business One',
  // Oracle
  oracle_netsuite: 'Oracle NetSuite',
  oracle_fusion:   'Oracle Fusion Cloud',
  // Sage
  sage_x3:         'Sage X3',
  sage_intacct:    'Sage Intacct',
  // Odoo
  odoo:            'Odoo',
  // Microsoft
  ms_d365_fo:      'Microsoft Dynamics 365 F&O',
  ms_d365_bc:      'Microsoft Dynamics 365 Business Central',
  // Other vendors
  workday:         'Workday',
  salesforce:      'Salesforce',
  infor_m3:        'Infor M3',
  ifs:             'IFS Cloud',
  epicor:          'Epicor Kinetic',
  acumatica:       'Acumatica Cloud',
  deltek:          'Deltek Costpoint',
  servicenow:      'ServiceNow',
  veeva:           'Veeva Vault',
};
/**
 * Load one public review page and return candidate "pain point" snippets.
 *
 * Despite its name the function is not tied to G2: it opens `url`, reads the
 * text of every <p>, <li> and <div> whose class contains "review", and keeps
 * the texts that mention a complaint-style keyword.
 *
 * @param {object} browser - Playwright browser; a fresh context is opened and closed per call.
 * @param {string} erp_id - ERP key copied into each row and used to look up ERP_NAMES.
 * @param {string} url - Review page to load.
 * @returns {Promise<object[]>} At most 15 rows shaped
 *   `{ erp_id, erp_name, source_url, title, snippet, confidence, keywords }`.
 *   Navigation or extraction failures are logged and yield the rows collected
 *   so far (normally an empty array) instead of rejecting.
 */
async function scrapeG2(browser, erp_id, url) {
  const context = await browser.newContext({
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    viewport: { width: 1280, height: 800 },
  });
  const results = [];
  try {
    // Opened inside the try so a failure here still releases the context below.
    const page = await context.newPage();
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 25000 });
    // Fixed pause after DOMContentLoaded — presumably to let late-rendered reviews appear.
    await page.waitForTimeout(3000);

    const extracted = await page.evaluate(() => {
      // This callback executes inside the page; nothing from the Node scope is visible here.
      // Stems carry \w* / optional suffixes so inflected forms ("frustrating",
      // "issues", "bugs") match; a bare stem followed by \b never would.
      const painPattern = /\b(dislik\w*|drawbacks?|limitations?|pain ?points?|missing|lack(?:s|ing)?|difficult\w*|slow\w*|complicated|bug(?:s|gy)?|issues?|problems?|frustrat\w*|workarounds?|hard to)\b/i;
      const seenPrefixes = new Set();
      const out = [];
      for (const el of document.querySelectorAll('p, div[class*="review"], li')) {
        const txt = (el.innerText || '').trim();
        // Skip fragments too short to be a sentence and containers that wrap whole pages.
        if (txt.length < 40 || txt.length > 1500) continue;
        if (!painPattern.test(txt)) continue;
        const snippet = txt.substring(0, 1000);
        // Nested elements repeat the same text; dedupe on the first 60 characters.
        const prefix = snippet.substring(0, 60);
        if (seenPrefixes.has(prefix)) continue;
        seenPrefixes.add(prefix);
        out.push({ snippet, title: txt.substring(0, 100) });
        if (out.length === 15) break;
      }
      return out;
    });

    for (const { title, snippet } of extracted) {
      // A content hash (rather than a timestamp) keeps the synthetic URL stable
      // across runs, so re-scraping the same review produces the same key and
      // can be rejected by the table's unique constraint.
      const digest = createHash('sha256').update(snippet).digest('hex').slice(0, 12);
      results.push({
        erp_id,
        erp_name: ERP_NAMES[erp_id] || erp_id,
        source_url: `${url}#pain-${digest}`,
        title,
        snippet,
        confidence: 0.65, // keyword-matched text from a public review page
        // Labels kept unchanged; already-stored rows may be queried by them.
        keywords: ['g2_review', 'playwright_scrape'],
      });
    }
  } catch (e) {
    console.error(` [ERR] ${url}: ${e.message}`);
  } finally {
    // Closing the context also closes every page that belongs to it.
    await context.close();
  }
  return results;
}
/**
 * Insert scraped rows into `erp_gap_scans`, one statement per row.
 *
 * Rows that hit a conflict are skipped by `ON CONFLICT DO NOTHING` and are not
 * counted. A row whose insert throws is logged and skipped so one bad row does
 * not abort the rest of the batch.
 *
 * @param {object} pool - Object exposing `query(sql, params)` (a pg Pool in this script).
 * @param {object[]} results - Rows as produced by scrapeG2.
 * @returns {Promise<number>} Number of rows actually inserted.
 */
async function storeResults(pool, results) {
  const insertSql = `INSERT INTO erp_gap_scans (erp_id, erp_name, query, source_url, title, snippet, confidence_score, keywords)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
ON CONFLICT DO NOTHING`;
  let inserted = 0;
  for (const r of results) {
    try {
      const res = await pool.query(insertSql, [
        r.erp_id,
        r.erp_name,
        'playwright_g2', // fixed label written to the `query` column
        r.source_url,
        r.title,
        r.snippet,
        r.confidence,
        r.keywords,
      ]);
      // rowCount is 0 when the row was skipped by ON CONFLICT.
      if (res.rowCount > 0) inserted++;
    } catch (e) {
      // Still best-effort, but the failure must be visible: a silent catch here
      // makes a broken connection or schema look like "0 inserted".
      console.error(` [DB ERR] ${r.source_url}: ${e.message}`);
    }
  }
  return inserted;
}
// Entry point: scrape the requested ERP (or every configured ERP), store the
// snippets, then exit. The browser and the DB pool are released even when a
// step throws, and the exit code reflects failure.
(async () => {
  const target = process.argv[2];
  console.log(`═══ SCAN-ERP-GAPS-PLAYWRIGHT · ${new Date().toISOString()} ═══`);

  // Own-property check so inherited names such as "constructor" are not taken for an ERP id.
  const isKnownTarget = typeof target === 'string'
    && Object.prototype.hasOwnProperty.call(ERP_URLS, target);
  if (target && target !== 'all' && !isKnownTarget) {
    // Same fallback as before (scan everything), but no longer silent.
    console.warn(`[WARN] unknown erp_id "${target}" — scanning all ERPs`);
  }
  const erpIds = isKnownTarget ? [target] : Object.keys(ERP_URLS);

  const pool = new Pool(DB);
  let browser;
  let totalInserted = 0;
  let exitCode = 0;
  try {
    browser = await chromium.launch({ headless: true, args: ['--no-sandbox'] });
    for (const erp_id of erpIds) {
      for (const url of ERP_URLS[erp_id]) {
        console.log(`\n━━━ ${erp_id} · ${url}`);
        const results = await scrapeG2(browser, erp_id, url);
        console.log(`${results.length} snippets extracted`);
        if (results.length > 0) {
          const ins = await storeResults(pool, results);
          console.log(`${ins} inserted`);
          totalInserted += ins;
        }
        // Pace between scrapes
        await new Promise(r => setTimeout(r, 2000));
      }
    }
  } catch (e) {
    console.error(`[FATAL] ${e.stack || e.message} (inserted so far: ${totalInserted})`);
    exitCode = 1;
  } finally {
    // Cleanup failures are reported but must not mask the outcome of the run.
    if (browser) {
      await browser.close().catch((e) => console.error(`[WARN] browser.close failed: ${e.message}`));
    }
    await pool.end().catch((e) => console.error(`[WARN] pool.end failed: ${e.message}`));
  }
  if (exitCode === 0) {
    console.log(`\n═══ DONE · inserted=${totalInserted} ═══`);
  }
  process.exit(exitCode);
})();