fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
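For context, a hypothetical sketch of the pattern this fix implies (the real monitor query is not shown in this diff; the join key and any columns beyond those named in the commit message are assumed): worker details come from dispensary_crawl_jobs rather than job_run_logs.

// Hypothetical illustration only - worker_id/worker_hostname, job_run_logs and
// dispensary_crawl_jobs come from the commit message; the join key is assumed.
const monitorQuery = `
    SELECT jrl.id, jrl.status, jrl.started_at,
           dcj.worker_id, dcj.worker_hostname
    FROM job_run_logs jrl
    LEFT JOIN dispensary_crawl_jobs dcj ON dcj.job_run_log_id = jrl.id
    ORDER BY jrl.started_at DESC
`;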
201 backend/dist/services/availability.js (vendored, new file)
@@ -0,0 +1,201 @@
"use strict";
/**
 * Availability Service
 *
 * Normalizes product availability from various menu providers and tracks
 * state transitions for inventory analytics.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeAvailability = normalizeAvailability;
exports.extractAvailabilityHints = extractAvailabilityHints;
exports.hintsToAvailability = hintsToAvailability;
exports.aggregateAvailability = aggregateAvailability;
// Threshold for considering stock as "limited"
const LIMITED_THRESHOLD = 5;
/**
 * Normalize availability from a Dutchie product
 *
 * Dutchie products can have various availability indicators:
 * - potencyAmount.quantity: explicit stock count
 * - status: sometimes includes stock status
 * - variants[].quantity: stock per variant
 * - isInStock / inStock: boolean flags
 */
function normalizeAvailability(dutchieProduct) {
    const raw = {};
    // Collect raw availability data for debugging
    if (dutchieProduct.potencyAmount?.quantity !== undefined) {
        raw.potencyQuantity = dutchieProduct.potencyAmount.quantity;
    }
    if (dutchieProduct.status !== undefined) {
        raw.status = dutchieProduct.status;
    }
    if (dutchieProduct.isInStock !== undefined) {
        raw.isInStock = dutchieProduct.isInStock;
    }
    if (dutchieProduct.inStock !== undefined) {
        raw.inStock = dutchieProduct.inStock;
    }
    if (dutchieProduct.variants?.length) {
        const variantQuantities = dutchieProduct.variants
            .filter((v) => v.quantity !== undefined)
            .map((v) => ({ option: v.option, quantity: v.quantity }));
        if (variantQuantities.length) {
            raw.variantQuantities = variantQuantities;
        }
    }
    // Try to extract quantity
    let quantity = null;
    // Check potencyAmount.quantity first (most reliable for Dutchie)
    if (typeof dutchieProduct.potencyAmount?.quantity === 'number') {
        quantity = dutchieProduct.potencyAmount.quantity;
    }
    // Sum variant quantities if available
    else if (dutchieProduct.variants?.length) {
        const totalVariantQty = dutchieProduct.variants.reduce((sum, v) => {
            return sum + (typeof v.quantity === 'number' ? v.quantity : 0);
        }, 0);
        if (totalVariantQty > 0) {
            quantity = totalVariantQty;
        }
    }
    // Determine status
    let status = 'unknown';
    // Explicit boolean flags take precedence
    if (dutchieProduct.isInStock === false || dutchieProduct.inStock === false) {
        status = 'out_of_stock';
    }
    else if (dutchieProduct.isInStock === true || dutchieProduct.inStock === true) {
        status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
    }
    // Check status string
    else if (typeof dutchieProduct.status === 'string') {
        const statusLower = dutchieProduct.status.toLowerCase();
        if (statusLower.includes('out') || statusLower.includes('unavailable')) {
            status = 'out_of_stock';
        }
        else if (statusLower.includes('limited') || statusLower.includes('low')) {
            status = 'limited';
        }
        else if (statusLower.includes('in') || statusLower.includes('available')) {
            status = 'in_stock';
        }
    }
    // Infer from quantity
    else if (quantity !== null) {
        if (quantity === 0) {
            status = 'out_of_stock';
        }
        else if (quantity <= LIMITED_THRESHOLD) {
            status = 'limited';
        }
        else {
            status = 'in_stock';
        }
    }
    return { status, quantity, raw };
}
/**
 * Extract availability hints from page content or product card HTML
 *
 * Used for sandbox provider scraping where we don't have structured data
 */
function extractAvailabilityHints(pageContent, productElement) {
    const hints = {};
    const content = (productElement || pageContent).toLowerCase();
    // Check for out-of-stock indicators
    const oosPatterns = [
        'out of stock',
        'out-of-stock',
        'sold out',
        'soldout',
        'unavailable',
        'not available',
        'coming soon',
        'notify me'
    ];
    hints.hasOutOfStockBadge = oosPatterns.some(p => content.includes(p));
    // Check for limited stock indicators
    const limitedPatterns = [
        'limited stock',
        'limited quantity',
        'low stock',
        'only \\d+ left',
        'few remaining',
        'almost gone',
        'selling fast'
    ];
    hints.hasLimitedBadge = limitedPatterns.some(p => {
        if (p.includes('\\d')) {
            return new RegExp(p, 'i').test(content);
        }
        return content.includes(p);
    });
    // Check for in-stock indicators
    const inStockPatterns = [
        'in stock',
        'in-stock',
        'add to cart',
        'add to bag',
        'buy now',
        'available'
    ];
    hints.hasInStockBadge = inStockPatterns.some(p => content.includes(p));
    // Try to extract quantity text
    const qtyMatch = content.match(/(\d+)\s*(left|remaining|in stock|available)/i);
    if (qtyMatch) {
        hints.quantityText = qtyMatch[0];
    }
    // Look for explicit stock text
    const stockTextMatch = content.match(/(out of stock|in stock|low stock|limited|sold out)[^<]*/i);
    if (stockTextMatch) {
        hints.stockText = stockTextMatch[0].trim();
    }
    return hints;
}
/**
 * Convert availability hints to normalized availability
 */
function hintsToAvailability(hints) {
    let status = 'unknown';
    let quantity = null;
    // Extract quantity if present
    if (hints.quantityText) {
        const match = hints.quantityText.match(/(\d+)/);
        if (match) {
            quantity = parseInt(match[1], 10);
        }
    }
    // Determine status from hints
    if (hints.hasOutOfStockBadge) {
        status = 'out_of_stock';
    }
    else if (hints.hasLimitedBadge) {
        status = 'limited';
    }
    else if (hints.hasInStockBadge) {
        status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
    }
    return {
        status,
        quantity,
        raw: hints
    };
}
function aggregateAvailability(products) {
    const counts = {
        in_stock: 0,
        out_of_stock: 0,
        limited: 0,
        unknown: 0,
        changed: 0
    };
    for (const product of products) {
        const status = product.availability_status || 'unknown';
        counts[status]++;
        if (product.previous_status && product.previous_status !== status) {
            counts.changed++;
        }
    }
    return counts;
}
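For reference, a minimal usage sketch of the exports above (illustrative only; the product object shape is assumed from the fields normalizeAvailability reads):

// Illustrative only - field names follow what normalizeAvailability reads above.
const availability = normalizeAvailability({
    isInStock: true,
    potencyAmount: { quantity: 3 },
    variants: [{ option: '1g', quantity: 3 }],
});
// -> { status: 'limited', quantity: 3, raw: {...} } because 3 <= LIMITED_THRESHOLD
const summary = aggregateAvailability([
    { availability_status: availability.status, previous_status: 'in_stock' },
]);
// -> { in_stock: 0, out_of_stock: 0, limited: 1, unknown: 0, changed: 1 }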
1098 backend/dist/services/category-crawler-jobs.js (vendored, new file)
File diff suppressed because it is too large
114 backend/dist/services/category-discovery.js (vendored)
@@ -4,9 +4,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.discoverCategories = discoverCategories;
const puppeteer_1 = __importDefault(require("puppeteer"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const migrate_1 = require("../db/migrate");
const logger_1 = require("./logger");
const age_gate_1 = require("../utils/age-gate");
const dutchie_1 = require("../scrapers/templates/dutchie");
// Apply stealth plugin
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
const DUTCHIE_CATEGORIES = [
    { name: 'Shop', slug: 'shop' },
    { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
@@ -19,6 +24,18 @@ const DUTCHIE_CATEGORIES = [
    { name: 'Brands', slug: 'brands' },
    { name: 'Specials', slug: 'specials' }
];
const CURALEAF_CATEGORIES = [
    { name: 'Shop', slug: 'shop' },
    { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
    { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
    { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
    { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
    { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
    { name: 'Tinctures', slug: 'tinctures', parentSlug: 'shop' },
    { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
    { name: 'Capsules', slug: 'capsules', parentSlug: 'shop' },
    { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' }
];
async function makePageStealthy(page) {
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
@@ -72,7 +89,7 @@ async function discoverCategories(storeId) {
    const store = storeResult.rows[0];
    const baseUrl = store.dutchie_url;
    // Launch browser to check page source
    browser = await puppeteer_1.default.launch({
    browser = await puppeteer_extra_1.default.launch({
        headless: 'new',
        args: [
            '--no-sandbox',
@@ -85,9 +102,14 @@ async function discoverCategories(storeId) {
    await makePageStealthy(page);
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
    const state = (0, age_gate_1.detectStateFromUrl)(baseUrl);
    await (0, age_gate_1.setAgeGateCookies)(page, baseUrl, state);
    logger_1.logger.info('categories', `Loading page to detect menu type: ${baseUrl}`);
    await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
    await page.waitForTimeout(3000);
    // If age gate still appears, try to bypass it
    await (0, age_gate_1.bypassAgeGate)(page, state);
    // Detect if it's a Dutchie menu by inspecting page source
    const isDutchie = await isDutchieMenu(page);
    await browser.close();
@@ -97,8 +119,9 @@ async function discoverCategories(storeId) {
        await createDutchieCategories(storeId, store);
    }
    else {
        logger_1.logger.info('categories', `⚠️ Non-Dutchie menu detected, would need custom scraping logic`);
        throw new Error('Non-Dutchie menus not yet supported. Please contact support.');
        // Fallback: Use standard cannabis categories for non-Dutchie sites
        logger_1.logger.info('categories', `Non-Dutchie menu detected, using standard cannabis categories for ${store.name}`);
        await createCuraleafCategories(storeId, store);
    }
}
catch (error) {
@@ -116,24 +139,24 @@ async function createDutchieCategories(storeId, store) {
    const baseUrl = store.dutchie_url;
    for (const category of DUTCHIE_CATEGORIES) {
        let categoryUrl;
        // Use Dutchie template to build correct category URLs
        if (category.parentSlug) {
            // Subcategory: /embedded-menu/{slug}/shop/flower
            categoryUrl = `${baseUrl}/${category.parentSlug}/${category.slug}`;
            // Subcategory: Use template's buildCategoryUrl (e.g., /products/flower)
            categoryUrl = dutchie_1.dutchieTemplate.buildCategoryUrl(baseUrl, category.name);
        }
        else {
            // Top-level: /embedded-menu/{slug}/shop
            // Top-level: Use base URL with slug
            categoryUrl = `${baseUrl}/${category.slug}`;
        }
        const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug;
        if (!category.parentSlug) {
            // Create parent category
            await client.query(`
                INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
                VALUES ($1, $2, $3, $4, $5, true, NULL)
                ON CONFLICT (store_id, slug)
                DO UPDATE SET name = $2, dutchie_url = $4, path = $5
                INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
                VALUES ($1, $2, $3, $4, true)
                ON CONFLICT (store_id, slug)
                DO UPDATE SET name = $2, dutchie_url = $4
                RETURNING id
            `, [storeId, category.name, category.slug, categoryUrl, path]);
            `, [storeId, category.name, category.slug, categoryUrl]);
            logger_1.logger.info('categories', `📁 ${category.name}`);
        }
        else {
@@ -143,13 +166,12 @@ async function createDutchieCategories(storeId, store) {
            WHERE store_id = $1 AND slug = $2
            `, [storeId, category.parentSlug]);
            if (parentResult.rows.length > 0) {
                const parentId = parentResult.rows[0].id;
                await client.query(`
                    INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
                    VALUES ($1, $2, $3, $4, $5, true, $6)
                    INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
                    VALUES ($1, $2, $3, $4, true)
                    ON CONFLICT (store_id, slug)
                    DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
                `, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
                    DO UPDATE SET name = $2, dutchie_url = $4
                `, [storeId, category.name, category.slug, categoryUrl]);
                logger_1.logger.info('categories', ` └── ${category.name}`);
            }
        }
@@ -166,3 +188,59 @@ async function createDutchieCategories(storeId, store) {
        client.release();
    }
}
async function createCuraleafCategories(storeId, store) {
    const client = await migrate_1.pool.connect();
    try {
        await client.query('BEGIN');
        logger_1.logger.info('categories', `Creating predefined Curaleaf category structure`);
        const baseUrl = store.dutchie_url;
        for (const category of CURALEAF_CATEGORIES) {
            let categoryUrl;
            if (category.parentSlug) {
                // Subcategory URL - Curaleaf uses pattern like: /stores/{store-slug}/{category}
                categoryUrl = `${baseUrl}?category=${category.slug}`;
            }
            else {
                // Top-level category
                categoryUrl = baseUrl;
            }
            if (!category.parentSlug) {
                // Create parent category
                await client.query(`
                    INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
                    VALUES ($1, $2, $3, $4, true)
                    ON CONFLICT (store_id, slug)
                    DO UPDATE SET name = $2, dutchie_url = $4
                    RETURNING id
                `, [storeId, category.name, category.slug, categoryUrl]);
                logger_1.logger.info('categories', `📁 ${category.name}`);
            }
            else {
                // Create subcategory
                const parentResult = await client.query(`
                    SELECT id FROM categories
                    WHERE store_id = $1 AND slug = $2
                `, [storeId, category.parentSlug]);
                if (parentResult.rows.length > 0) {
                    await client.query(`
                        INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
                        VALUES ($1, $2, $3, $4, true)
                        ON CONFLICT (store_id, slug)
                        DO UPDATE SET name = $2, dutchie_url = $4
                    `, [storeId, category.name, category.slug, categoryUrl]);
                    logger_1.logger.info('categories', ` └── ${category.name}`);
                }
            }
        }
        await client.query('COMMIT');
        logger_1.logger.info('categories', `✅ Created ${CURALEAF_CATEGORIES.length} Curaleaf categories successfully`);
    }
    catch (error) {
        await client.query('ROLLBACK');
        logger_1.logger.error('categories', `Failed to create Curaleaf categories: ${error}`);
        throw error;
    }
    finally {
        client.release();
    }
}
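Subcategory URLs now come from the Dutchie template rather than manual path concatenation. A rough sketch of what such a helper might look like (the real dutchieTemplate.buildCategoryUrl lives in ../scrapers/templates/dutchie and is not shown in this diff; the /products/<slug> shape is assumed from the inline comment above):

// Hypothetical sketch of a buildCategoryUrl helper; not the actual template code.
function buildCategoryUrl(baseUrl, categoryName) {
    const slug = categoryName.toLowerCase().replace(/\s+/g, '-');
    return `${baseUrl.replace(/\/$/, '')}/products/${slug}`;
}
// buildCategoryUrl('https://dutchie.com/embedded-menu/example-store', 'Pre-Rolls')
// -> 'https://dutchie.com/embedded-menu/example-store/products/pre-rolls'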
536 backend/dist/services/crawl-scheduler.js (vendored, new file)
@@ -0,0 +1,536 @@
"use strict";
/**
 * Crawl Scheduler Service
 *
 * This service manages crawl scheduling using a job queue approach.
 * It does NOT modify the crawler - it only TRIGGERS the existing crawler.
 *
 * Features:
 * - Global schedule: crawl all stores every N hours
 * - Daily special run: 12:01 AM local store time
 * - Per-store schedule overrides
 * - Job queue for tracking pending/running crawls
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getGlobalSchedule = getGlobalSchedule;
exports.updateGlobalSchedule = updateGlobalSchedule;
exports.getStoreScheduleStatuses = getStoreScheduleStatuses;
exports.getStoreSchedule = getStoreSchedule;
exports.updateStoreSchedule = updateStoreSchedule;
exports.createCrawlJob = createCrawlJob;
exports.getPendingJobs = getPendingJobs;
exports.claimJob = claimJob;
exports.completeJob = completeJob;
exports.getRecentJobs = getRecentJobs;
exports.getAllRecentJobs = getAllRecentJobs;
exports.checkAndCreateScheduledJobs = checkAndCreateScheduledJobs;
exports.checkAndCreateDailySpecialJobs = checkAndCreateDailySpecialJobs;
exports.processJobs = processJobs;
exports.processOrchestrator = processOrchestrator;
exports.setSchedulerMode = setSchedulerMode;
exports.getSchedulerMode = getSchedulerMode;
exports.startCrawlScheduler = startCrawlScheduler;
exports.stopCrawlScheduler = stopCrawlScheduler;
exports.restartCrawlScheduler = restartCrawlScheduler;
exports.triggerManualCrawl = triggerManualCrawl;
exports.triggerAllStoresCrawl = triggerAllStoresCrawl;
exports.cancelJob = cancelJob;
const node_cron_1 = __importDefault(require("node-cron"));
const migrate_1 = require("../db/migrate");
const scraper_v2_1 = require("../scraper-v2");
const store_crawl_orchestrator_1 = require("./store-crawl-orchestrator");
// Worker identification
const WORKER_ID = `worker-${process.pid}-${Date.now()}`;
let schedulerCronJob = null;
let jobProcessorRunning = false;
let orchestratorProcessorRunning = false;
// Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration
let schedulerMode = 'orchestrator';
// ============================================
// Schedule Management
// ============================================
/**
 * Get global schedule settings
 */
async function getGlobalSchedule() {
    const result = await migrate_1.pool.query(`
        SELECT * FROM crawler_schedule ORDER BY id
    `);
    return result.rows;
}
/**
 * Update global schedule setting
 */
async function updateGlobalSchedule(scheduleType, updates) {
    const setClauses = [];
    const values = [];
    let paramIndex = 1;
    if (updates.enabled !== undefined) {
        setClauses.push(`enabled = $${paramIndex++}`);
        values.push(updates.enabled);
    }
    if (updates.interval_hours !== undefined) {
        setClauses.push(`interval_hours = $${paramIndex++}`);
        values.push(updates.interval_hours);
    }
    if (updates.run_time !== undefined) {
        setClauses.push(`run_time = $${paramIndex++}`);
        values.push(updates.run_time);
    }
    values.push(scheduleType);
    const result = await migrate_1.pool.query(`
        UPDATE crawler_schedule
        SET ${setClauses.join(', ')}
        WHERE schedule_type = $${paramIndex}
        RETURNING *
    `, values);
    return result.rows[0];
}
/**
 * Get all store schedule statuses
 */
async function getStoreScheduleStatuses() {
    const result = await migrate_1.pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`);
    return result.rows;
}
/**
 * Get or create per-store schedule override
 */
async function getStoreSchedule(storeId) {
    const result = await migrate_1.pool.query(`
        SELECT * FROM store_crawl_schedule WHERE store_id = $1
    `, [storeId]);
    if (result.rows.length > 0) {
        return result.rows[0];
    }
    // Return default (use global)
    return {
        store_id: storeId,
        enabled: true,
        interval_hours: null,
        daily_special_enabled: true,
        daily_special_time: null,
        priority: 0
    };
}
/**
 * Update per-store schedule override
 */
async function updateStoreSchedule(storeId, updates) {
    const result = await migrate_1.pool.query(`
        INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
        VALUES ($1, $2, $3, $4, $5, $6)
        ON CONFLICT (store_id) DO UPDATE SET
            enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
            interval_hours = EXCLUDED.interval_hours,
            daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
            daily_special_time = EXCLUDED.daily_special_time,
            priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
            updated_at = NOW()
        RETURNING *
    `, [
        storeId,
        updates.enabled ?? true,
        updates.interval_hours ?? null,
        updates.daily_special_enabled ?? true,
        updates.daily_special_time ?? null,
        updates.priority ?? 0
    ]);
    return result.rows[0];
}
// ============================================
// Job Queue Management
// ============================================
/**
 * Create a new crawl job
 */
async function createCrawlJob(storeId, jobType = 'full_crawl', triggerType = 'scheduled', scheduledAt = new Date(), priority = 0) {
    // Check if there's already a pending or running job for this store
    const existing = await migrate_1.pool.query(`
        SELECT id FROM crawl_jobs
        WHERE store_id = $1 AND status IN ('pending', 'running')
        LIMIT 1
    `, [storeId]);
    if (existing.rows.length > 0) {
        console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
        return existing.rows[0];
    }
    const result = await migrate_1.pool.query(`
        INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
        VALUES ($1, $2, $3, $4, $5, 'pending')
        RETURNING *
    `, [storeId, jobType, triggerType, scheduledAt, priority]);
    console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
    return result.rows[0];
}
/**
 * Get pending jobs ready to run
 */
async function getPendingJobs(limit = 5) {
    const result = await migrate_1.pool.query(`
        SELECT cj.*, s.name as store_name
        FROM crawl_jobs cj
        JOIN stores s ON s.id = cj.store_id
        WHERE cj.status = 'pending'
          AND cj.scheduled_at <= NOW()
        ORDER BY cj.priority DESC, cj.scheduled_at ASC
        LIMIT $1
    `, [limit]);
    return result.rows;
}
/**
 * Claim a job for processing
 */
async function claimJob(jobId) {
    const result = await migrate_1.pool.query(`
        UPDATE crawl_jobs
        SET status = 'running', started_at = NOW(), worker_id = $2
        WHERE id = $1 AND status = 'pending'
        RETURNING id
    `, [jobId, WORKER_ID]);
    return result.rows.length > 0;
}
/**
 * Complete a job
 */
async function completeJob(jobId, success, results) {
    await migrate_1.pool.query(`
        UPDATE crawl_jobs
        SET
            status = $2,
            completed_at = NOW(),
            products_found = $3,
            error_message = $4
        WHERE id = $1
    `, [
        jobId,
        success ? 'completed' : 'failed',
        results?.products_found ?? null,
        results?.error_message ?? null
    ]);
}
/**
 * Get recent jobs for a store
 */
async function getRecentJobs(storeId, limit = 10) {
    const result = await migrate_1.pool.query(`
        SELECT * FROM crawl_jobs
        WHERE store_id = $1
        ORDER BY created_at DESC
        LIMIT $2
    `, [storeId, limit]);
    return result.rows;
}
/**
 * Get all recent jobs
 */
async function getAllRecentJobs(limit = 50) {
    const result = await migrate_1.pool.query(`
        SELECT cj.*, s.name as store_name, s.slug as store_slug
        FROM crawl_jobs cj
        JOIN stores s ON s.id = cj.store_id
        ORDER BY cj.created_at DESC
        LIMIT $1
    `, [limit]);
    return result.rows;
}
// ============================================
// Scheduler Logic
// ============================================
/**
 * Check which stores are due for a crawl and create jobs
 */
async function checkAndCreateScheduledJobs() {
    console.log('Checking for stores due for crawl...');
    // Get global schedule settings
    const globalSchedule = await migrate_1.pool.query(`
        SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
    `);
    if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
        console.log('Global scheduler is disabled');
        return 0;
    }
    const intervalHours = globalSchedule.rows[0].interval_hours || 4;
    // Find stores due for crawl
    const result = await migrate_1.pool.query(`
        SELECT
            s.id,
            s.name,
            s.timezone,
            s.last_scraped_at,
            COALESCE(scs.enabled, TRUE) as schedule_enabled,
            COALESCE(scs.interval_hours, $1) as interval_hours,
            COALESCE(scs.priority, 0) as priority
        FROM stores s
        LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
        WHERE s.active = TRUE
          AND s.scrape_enabled = TRUE
          AND COALESCE(scs.enabled, TRUE) = TRUE
          AND (
            s.last_scraped_at IS NULL
            OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
          )
          AND NOT EXISTS (
            SELECT 1 FROM crawl_jobs cj
            WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
          )
        ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
    `, [intervalHours]);
    let jobsCreated = 0;
    for (const store of result.rows) {
        try {
            await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
            jobsCreated++;
            console.log(`Scheduled crawl job for: ${store.name}`);
        }
        catch (error) {
            console.error(`Failed to create job for store ${store.name}:`, error);
        }
    }
    console.log(`Created ${jobsCreated} scheduled crawl jobs`);
    return jobsCreated;
}
/**
 * Check for daily special runs (12:01 AM local time)
 */
async function checkAndCreateDailySpecialJobs() {
    console.log('Checking for daily special runs...');
    // Get daily special schedule
    const dailySchedule = await migrate_1.pool.query(`
        SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
    `);
    if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
        console.log('Daily special scheduler is disabled');
        return 0;
    }
    const targetTime = dailySchedule.rows[0].run_time || '00:01';
    // Find stores where it's currently the target time in their local timezone
    // and they haven't had a daily special run today
    const result = await migrate_1.pool.query(`
        SELECT
            s.id,
            s.name,
            s.timezone,
            COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
            COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
            COALESCE(scs.priority, 0) as priority
        FROM stores s
        LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
        WHERE s.active = TRUE
          AND s.scrape_enabled = TRUE
          AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
          -- Check if current time in store timezone matches the target time (within 2 minutes)
          AND ABS(
            EXTRACT(EPOCH FROM (
              (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
              - COALESCE(scs.daily_special_time, $1::TIME)
            ))
          ) < 120 -- within 2 minutes
          -- Ensure we haven't already created a daily_special job today for this store
          AND NOT EXISTS (
            SELECT 1 FROM crawl_jobs cj
            WHERE cj.store_id = s.id
              AND cj.trigger_type = 'daily_special'
              AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
          )
          AND NOT EXISTS (
            SELECT 1 FROM crawl_jobs cj
            WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
          )
        ORDER BY COALESCE(scs.priority, 0) DESC
    `, [targetTime]);
    let jobsCreated = 0;
    for (const store of result.rows) {
        try {
            await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
            jobsCreated++;
            console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
        }
        catch (error) {
            console.error(`Failed to create daily special job for store ${store.name}:`, error);
        }
    }
    if (jobsCreated > 0) {
        console.log(`Created ${jobsCreated} daily special crawl jobs`);
    }
    return jobsCreated;
}
/**
 * Process pending jobs
 */
async function processJobs() {
    if (jobProcessorRunning) {
        console.log('Job processor already running, skipping...');
        return;
    }
    jobProcessorRunning = true;
    try {
        const jobs = await getPendingJobs(1); // Process one at a time for safety
        for (const job of jobs) {
            console.log(`Processing job ${job.id} for store: ${job.store_name}`);
            const claimed = await claimJob(job.id);
            if (!claimed) {
                console.log(`Job ${job.id} already claimed by another worker`);
                continue;
            }
            try {
                // Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
                await (0, scraper_v2_1.scrapeStore)(job.store_id);
                // Update store's last_scraped_at
                await migrate_1.pool.query(`
                    UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
                `, [job.store_id]);
                await completeJob(job.id, true, {});
                console.log(`Job ${job.id} completed successfully`);
            }
            catch (error) {
                console.error(`Job ${job.id} failed:`, error);
                await completeJob(job.id, false, { error_message: error.message });
            }
        }
    }
    finally {
        jobProcessorRunning = false;
    }
}
/**
 * Process stores using the intelligent orchestrator
 * This replaces the simple job queue approach with intelligent provider detection
 */
async function processOrchestrator() {
    if (orchestratorProcessorRunning) {
        console.log('Orchestrator processor already running, skipping...');
        return;
    }
    orchestratorProcessorRunning = true;
    try {
        // Get stores due for orchestration (respects schedule, intervals, etc.)
        const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(3); // Process up to 3 at a time
        if (storeIds.length === 0) {
            return;
        }
        console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);
        // Process each store through the orchestrator
        for (const storeId of storeIds) {
            try {
                console.log(`Orchestrator: Starting crawl for store ${storeId}`);
                const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
                console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
            }
            catch (error) {
                console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
            }
        }
        console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
    }
    finally {
        orchestratorProcessorRunning = false;
    }
}
// ============================================
// Scheduler Control
// ============================================
/**
 * Set scheduler mode
 */
function setSchedulerMode(mode) {
    schedulerMode = mode;
    console.log(`Scheduler mode set to: ${mode}`);
}
/**
 * Get current scheduler mode
 */
function getSchedulerMode() {
    return schedulerMode;
}
/**
 * Start the scheduler (runs every minute to check for due jobs)
 */
async function startCrawlScheduler() {
    stopCrawlScheduler();
    console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);
    // Run every minute
    schedulerCronJob = node_cron_1.default.schedule('* * * * *', async () => {
        try {
            if (schedulerMode === 'orchestrator') {
                // Use intelligent orchestrator (handles detection + crawl)
                await processOrchestrator();
            }
            else {
                // Legacy mode: job queue approach
                // Check for interval-based scheduled jobs
                await checkAndCreateScheduledJobs();
                // Check for daily special runs
                await checkAndCreateDailySpecialJobs();
                // Process any pending jobs
                await processJobs();
            }
        }
        catch (error) {
            console.error('Scheduler tick error:', error);
        }
    });
    console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
}
/**
 * Stop the scheduler
 */
function stopCrawlScheduler() {
    if (schedulerCronJob) {
        schedulerCronJob.stop();
        schedulerCronJob = null;
        console.log('Crawl scheduler stopped');
    }
}
/**
 * Restart the scheduler
 */
async function restartCrawlScheduler() {
    await startCrawlScheduler();
}
// ============================================
// Manual Triggers
// ============================================
/**
 * Manually trigger a crawl for a specific store (creates a job immediately)
 */
async function triggerManualCrawl(storeId) {
    console.log(`Manual crawl triggered for store ID: ${storeId}`);
    return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
}
/**
 * Manually trigger crawls for all stores
 */
async function triggerAllStoresCrawl() {
    console.log('Manual crawl triggered for all stores');
    const result = await migrate_1.pool.query(`
        SELECT id, name FROM stores
        WHERE active = TRUE AND scrape_enabled = TRUE
          AND NOT EXISTS (
            SELECT 1 FROM crawl_jobs cj
            WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
          )
    `);
    let jobsCreated = 0;
    for (const store of result.rows) {
        await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
        jobsCreated++;
    }
    console.log(`Created ${jobsCreated} manual crawl jobs`);
    return jobsCreated;
}
/**
 * Cancel a pending job
 */
async function cancelJob(jobId) {
    const result = await migrate_1.pool.query(`
        UPDATE crawl_jobs
        SET status = 'cancelled'
        WHERE id = $1 AND status = 'pending'
        RETURNING id
    `, [jobId]);
    return result.rows.length > 0;
}
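For reference, a minimal boot sketch for the scheduler above (the require path, the CRAWL_MODE environment variable, and the signal handling are assumptions for illustration, not part of this commit):

// Illustrative startup wiring only - paths and env vars are assumed.
const scheduler = require('./services/crawl-scheduler');
async function bootCrawler() {
    scheduler.setSchedulerMode(process.env.CRAWL_MODE === 'legacy' ? 'legacy' : 'orchestrator');
    await scheduler.startCrawlScheduler(); // cron tick: every minute
    process.on('SIGTERM', () => scheduler.stopCrawlScheduler());
}
bootCrawler().catch(err => console.error('Failed to start crawl scheduler:', err));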
476 backend/dist/services/crawler-jobs.js (vendored, new file)
@@ -0,0 +1,476 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Crawler Jobs Service
|
||||
*
|
||||
* Handles three types of jobs:
|
||||
* 1. DetectMenuProviderJob - Detect menu provider for a dispensary
|
||||
* 2. DutchieMenuCrawlJob - Production Dutchie crawl
|
||||
* 3. SandboxCrawlJob - Learning/testing crawl for unknown providers
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.runDetectMenuProviderJob = runDetectMenuProviderJob;
|
||||
exports.runDutchieMenuCrawlJob = runDutchieMenuCrawlJob;
|
||||
exports.runSandboxCrawlJob = runSandboxCrawlJob;
|
||||
exports.processSandboxJobs = processSandboxJobs;
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const logger_1 = require("./logger");
|
||||
const menu_provider_detector_1 = require("./menu-provider-detector");
|
||||
const scraper_v2_1 = require("../scraper-v2");
|
||||
const puppeteer_1 = __importDefault(require("puppeteer"));
|
||||
const fs_1 = require("fs");
|
||||
const path_1 = __importDefault(require("path"));
|
||||
const availability_1 = require("./availability");
|
||||
const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
|
||||
// ========================================
|
||||
// Helper Functions
|
||||
// ========================================
|
||||
async function getDispensary(dispensaryId) {
|
||||
const result = await migrate_1.pool.query(`SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence,
|
||||
crawler_mode, crawler_status, scraper_template
|
||||
FROM dispensaries WHERE id = $1`, [dispensaryId]);
|
||||
return result.rows[0] || null;
|
||||
}
|
||||
async function updateDispensary(dispensaryId, updates) {
|
||||
const setClauses = [];
|
||||
const values = [];
|
||||
let paramIndex = 1;
|
||||
for (const [key, value] of Object.entries(updates)) {
|
||||
setClauses.push(`${key} = $${paramIndex}`);
|
||||
values.push(value);
|
||||
paramIndex++;
|
||||
}
|
||||
setClauses.push(`updated_at = NOW()`);
|
||||
values.push(dispensaryId);
|
||||
await migrate_1.pool.query(`UPDATE dispensaries SET ${setClauses.join(', ')} WHERE id = $${paramIndex}`, values);
|
||||
}
|
||||
async function createSandboxEntry(dispensaryId, suspectedProvider, mode, detectionSignals) {
|
||||
// First, check if there's an existing active sandbox
|
||||
const existing = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes
|
||||
WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId]);
|
||||
if (existing.rows.length > 0) {
|
||||
// Update existing
|
||||
await migrate_1.pool.query(`UPDATE crawler_sandboxes
|
||||
SET suspected_menu_provider = $2, mode = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW()
|
||||
WHERE id = $1`, [existing.rows[0].id, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : null]);
|
||||
return existing.rows[0].id;
|
||||
}
|
||||
// Create new
|
||||
const result = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, suspected_menu_provider, mode, detection_signals, status)
|
||||
VALUES ($1, $2, $3, $4, 'pending')
|
||||
RETURNING id`, [dispensaryId, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : '{}']);
|
||||
return result.rows[0].id;
|
||||
}
|
||||
async function createSandboxJob(dispensaryId, sandboxId, jobType, priority = 0) {
|
||||
const result = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
|
||||
VALUES ($1, $2, $3, 'pending', $4)
|
||||
RETURNING id`, [dispensaryId, sandboxId, jobType, priority]);
|
||||
return result.rows[0].id;
|
||||
}
|
||||
// Get linked store ID for a dispensary (for using existing scraper)
|
||||
async function getStoreIdForDispensary(dispensaryId) {
|
||||
// Check if there's a stores entry linked to this dispensary
|
||||
const result = await migrate_1.pool.query(`SELECT s.id FROM stores s
|
||||
JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`, [dispensaryId]);
|
||||
if (result.rows.length > 0) {
|
||||
return result.rows[0].id;
|
||||
}
|
||||
// Try to find by website
|
||||
const result2 = await migrate_1.pool.query(`SELECT s.id FROM stores s
|
||||
JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`, [dispensaryId]);
|
||||
return result2.rows[0]?.id || null;
|
||||
}
|
||||
// ========================================
|
||||
// Job 1: Detect Menu Provider
|
||||
// ========================================
|
||||
async function runDetectMenuProviderJob(dispensaryId) {
|
||||
logger_1.logger.info('crawler-jobs', `Starting menu provider detection for dispensary ${dispensaryId}`);
|
||||
const dispensary = await getDispensary(dispensaryId);
|
||||
if (!dispensary) {
|
||||
return { success: false, message: `Dispensary ${dispensaryId} not found` };
|
||||
}
|
||||
// Check for website URL
|
||||
const websiteUrl = dispensary.website || dispensary.menu_url;
|
||||
if (!websiteUrl) {
|
||||
await updateDispensary(dispensaryId, {
|
||||
crawler_status: 'error_needs_review',
|
||||
last_menu_error_at: new Date(),
|
||||
last_error_message: 'No website URL available for detection',
|
||||
});
|
||||
return { success: false, message: 'No website URL available' };
|
||||
}
|
||||
try {
|
||||
// Run detection
|
||||
const detection = await (0, menu_provider_detector_1.detectMenuProvider)(websiteUrl, {
|
||||
checkMenuPaths: true,
|
||||
timeout: 30000,
|
||||
});
|
||||
// Update dispensary with results
|
||||
const updates = {
|
||||
menu_provider: detection.provider,
|
||||
menu_provider_confidence: detection.confidence,
|
||||
provider_detection_data: JSON.stringify({
|
||||
signals: detection.signals,
|
||||
urlsTested: detection.urlsTested,
|
||||
menuEntryPoints: detection.menuEntryPoints,
|
||||
rawSignals: detection.rawSignals,
|
||||
detectedAt: new Date().toISOString(),
|
||||
}),
|
||||
crawler_status: 'idle',
|
||||
};
|
||||
// Decide crawler mode based on provider
|
||||
if (detection.provider === 'dutchie' && detection.confidence >= 70) {
|
||||
// Dutchie with high confidence -> production
|
||||
updates.crawler_mode = 'production';
|
||||
logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as Dutchie (${detection.confidence}%), setting to production`);
|
||||
}
|
||||
else {
|
||||
// Unknown or non-Dutchie -> sandbox
|
||||
updates.crawler_mode = 'sandbox';
|
||||
// Create sandbox entry for further analysis
|
||||
const sandboxId = await createSandboxEntry(dispensaryId, detection.provider, 'detection', {
|
||||
signals: detection.signals,
|
||||
rawSignals: detection.rawSignals,
|
||||
});
|
||||
// Queue sandbox crawl job
|
||||
await createSandboxJob(dispensaryId, sandboxId, 'detection');
|
||||
logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as ${detection.provider} (${detection.confidence}%), setting to sandbox`);
|
||||
}
|
||||
// Update menu entry points if found
|
||||
if (detection.menuEntryPoints.length > 0 && !dispensary.menu_url) {
|
||||
updates.menu_url = detection.menuEntryPoints[0];
|
||||
}
|
||||
await updateDispensary(dispensaryId, updates);
|
||||
return {
|
||||
success: true,
|
||||
message: `Detected provider: ${detection.provider} (${detection.confidence}%)`,
|
||||
data: {
|
||||
provider: detection.provider,
|
||||
confidence: detection.confidence,
|
||||
mode: updates.crawler_mode,
|
||||
menuEntryPoints: detection.menuEntryPoints,
|
||||
},
|
||||
};
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('crawler-jobs', `Detection failed for dispensary ${dispensaryId}: ${error.message}`);
|
||||
await updateDispensary(dispensaryId, {
|
||||
crawler_status: 'error_needs_review',
|
||||
last_menu_error_at: new Date(),
|
||||
last_error_message: `Detection failed: ${error.message}`,
|
||||
});
|
||||
return { success: false, message: error.message };
|
||||
}
|
||||
}
|
||||
// ========================================
|
||||
// Job 2: Dutchie Menu Crawl (Production)
|
||||
// ========================================
|
||||
async function runDutchieMenuCrawlJob(dispensaryId) {
|
||||
logger_1.logger.info('crawler-jobs', `Starting Dutchie production crawl for dispensary ${dispensaryId}`);
|
||||
const dispensary = await getDispensary(dispensaryId);
|
||||
if (!dispensary) {
|
||||
return { success: false, message: `Dispensary ${dispensaryId} not found` };
|
||||
}
|
||||
// Verify it's a Dutchie production dispensary
|
||||
if (dispensary.menu_provider !== 'dutchie') {
|
||||
logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not Dutchie, skipping production crawl`);
|
||||
return { success: false, message: 'Not a Dutchie dispensary' };
|
||||
}
|
||||
if (dispensary.crawler_mode !== 'production') {
|
||||
logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not in production mode, skipping`);
|
||||
return { success: false, message: 'Not in production mode' };
|
||||
}
|
||||
// Find linked store ID
|
||||
const storeId = await getStoreIdForDispensary(dispensaryId);
|
||||
if (!storeId) {
|
||||
// Need to create a store entry or handle differently
|
||||
logger_1.logger.warn('crawler-jobs', `No linked store found for dispensary ${dispensaryId}`);
|
||||
return { success: false, message: 'No linked store found - needs setup' };
|
||||
}
|
||||
try {
|
||||
// Update status to running
|
||||
await updateDispensary(dispensaryId, { crawler_status: 'running' });
|
||||
// Run the existing Dutchie scraper
|
||||
await (0, scraper_v2_1.scrapeStore)(storeId, 3); // 3 parallel workers
|
||||
// Update success status
|
||||
await updateDispensary(dispensaryId, {
|
||||
crawler_status: 'ok',
|
||||
last_menu_scrape: new Date(),
|
||||
menu_scrape_status: 'active',
|
||||
});
|
||||
logger_1.logger.info('crawler-jobs', `Dutchie crawl completed for dispensary ${dispensaryId}`);
|
||||
return {
|
||||
success: true,
|
||||
message: 'Dutchie crawl completed successfully',
|
||||
data: { storeId },
|
||||
};
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('crawler-jobs', `Dutchie crawl failed for dispensary ${dispensaryId}: ${error.message}`);
|
||||
// Check if this might be a provider change
|
||||
let providerChanged = false;
|
||||
try {
|
||||
const browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox'] });
|
||||
const page = await browser.newPage();
|
||||
const url = dispensary.menu_url || dispensary.website;
|
||||
if (url) {
|
||||
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
|
||||
const changeResult = await (0, menu_provider_detector_1.detectProviderChange)(page, 'dutchie');
|
||||
providerChanged = changeResult.changed;
|
||||
if (providerChanged) {
|
||||
// Provider changed - move to sandbox
|
||||
await updateDispensary(dispensaryId, {
|
||||
crawler_mode: 'sandbox',
|
||||
crawler_status: 'error_needs_review',
|
||||
last_menu_error_at: new Date(),
|
||||
last_error_message: `Provider appears to have changed from Dutchie to ${changeResult.newProvider}`,
|
||||
});
|
||||
const sandboxId = await createSandboxEntry(dispensaryId, changeResult.newProvider || 'unknown', 'detection', { providerChangeDetected: true, previousProvider: 'dutchie' });
|
||||
await createSandboxJob(dispensaryId, sandboxId, 'detection');
|
||||
logger_1.logger.warn('crawler-jobs', `Provider change detected for dispensary ${dispensaryId}: Dutchie -> ${changeResult.newProvider}`);
|
||||
}
|
||||
}
|
||||
await browser.close();
|
||||
}
|
||||
catch {
|
||||
// Ignore detection errors during failure handling
|
||||
}
|
||||
if (!providerChanged) {
|
||||
await updateDispensary(dispensaryId, {
|
||||
crawler_status: 'error_needs_review',
|
||||
last_menu_error_at: new Date(),
|
||||
last_error_message: error.message,
|
||||
});
|
||||
}
|
||||
return { success: false, message: error.message };
|
||||
}
|
||||
}
|
||||
// ========================================
|
||||
// Job 3: Sandbox Crawl (Learning Mode)
|
||||
// ========================================
|
||||
async function runSandboxCrawlJob(dispensaryId, sandboxId) {
|
||||
logger_1.logger.info('crawler-jobs', `Starting sandbox crawl for dispensary ${dispensaryId}`);
|
||||
const dispensary = await getDispensary(dispensaryId);
|
||||
if (!dispensary) {
|
||||
return { success: false, message: `Dispensary ${dispensaryId} not found` };
|
||||
}
|
||||
// Get or create sandbox entry
|
||||
let sandbox;
|
||||
if (sandboxId) {
|
||||
const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
|
||||
sandbox = result.rows[0];
|
||||
}
|
||||
else {
|
||||
const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes
|
||||
WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')
|
||||
ORDER BY created_at DESC LIMIT 1`, [dispensaryId]);
|
||||
sandbox = result.rows[0];
|
||||
if (!sandbox) {
|
||||
const newSandboxId = await createSandboxEntry(dispensaryId, dispensary.menu_provider, 'template_learning');
|
||||
const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
|
||||
sandbox = result.rows[0];
|
||||
}
|
||||
}
|
||||
const websiteUrl = dispensary.menu_url || dispensary.website;
|
||||
if (!websiteUrl) {
|
||||
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]);
|
||||
return { success: false, message: 'No website URL available' };
|
||||
}
|
||||
let browser = null;
|
||||
try {
|
||||
// Update status
|
||||
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
|
||||
await updateDispensary(dispensaryId, { crawler_status: 'running' });
|
||||
// Launch browser
|
||||
browser = await puppeteer_1.default.launch({
|
||||
headless: true,
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
});
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
||||
// URLs to crawl (limited depth for sandbox)
|
||||
const urlsToVisit = [websiteUrl];
|
||||
const menuPaths = ['/menu', '/shop', '/products', '/order'];
|
||||
for (const path of menuPaths) {
|
||||
const baseUrl = new URL(websiteUrl).origin;
|
||||
urlsToVisit.push(`${baseUrl}${path}`);
|
||||
}
|
||||
const urlsTested = [];
|
||||
const menuEntryPoints = [];
|
||||
const capturedHtml = [];
|
||||
const analysisData = {
|
||||
provider_signals: {},
|
||||
selector_candidates: [],
|
||||
page_structures: [],
|
||||
};
|
||||
// Crawl each URL
|
||||
for (const url of urlsToVisit) {
|
||||
try {
|
||||
urlsTested.push(url);
|
||||
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
|
||||
await new Promise(r => setTimeout(r, 2000)); // Wait for dynamic content
|
||||
// Get page HTML
|
||||
const html = await page.content();
|
||||
// Check if this looks like a menu page
|
||||
const hasMenuContent = await page.evaluate(() => {
|
||||
const text = document.body.innerText.toLowerCase();
|
||||
return (text.includes('add to cart') ||
|
||||
text.includes('thc') ||
|
||||
text.includes('indica') ||
|
||||
text.includes('sativa'));
|
||||
});
|
||||
if (hasMenuContent) {
|
||||
menuEntryPoints.push(url);
|
||||
capturedHtml.push({ url, html });
|
||||
// Analyze page structure for selector candidates
|
||||
const structure = await page.evaluate(() => {
|
||||
const candidates = [];
|
||||
// Look for product-like containers
|
||||
const productSelectors = [
|
||||
'.product', '.product-card', '.menu-item', '.item-card',
|
||||
'[data-product]', '[data-item]', '.strain', '.listing',
|
||||
];
|
||||
for (const selector of productSelectors) {
|
||||
const els = document.querySelectorAll(selector);
|
||||
if (els.length > 3) { // Likely a list
|
||||
candidates.push({
|
||||
selector,
|
||||
count: els.length,
|
||||
type: 'product_container',
|
||||
});
|
||||
}
|
||||
}
|
||||
// Look for price patterns
|
||||
const pricePattern = /\$\d+(\.\d{2})?/;
|
||||
const textNodes = document.body.innerText;
|
||||
const priceMatches = textNodes.match(/\$\d+(\.\d{2})?/g);
|
||||
return {
|
||||
candidates,
|
||||
priceCount: priceMatches?.length || 0,
|
||||
hasAddToCart: textNodes.toLowerCase().includes('add to cart'),
|
||||
};
|
||||
});
|
||||
// Extract availability hints from page content
|
||||
const availabilityHints = (0, availability_1.extractAvailabilityHints)(html);
|
||||
analysisData.page_structures.push({
|
||||
url,
|
||||
...structure,
|
||||
availabilityHints,
|
||||
});
|
||||
}
|
||||
}
|
||||
catch (pageError) {
|
||||
if (!pageError.message.includes('404')) {
|
||||
logger_1.logger.warn('crawler-jobs', `Sandbox crawl error for ${url}: ${pageError.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Save HTML to storage (local for now, S3 later)
|
||||
let rawHtmlLocation = null;
|
||||
if (capturedHtml.length > 0) {
|
||||
const htmlDir = path_1.default.join(process.cwd(), 'sandbox-data', `dispensary-${dispensaryId}`);
|
||||
await fs_1.promises.mkdir(htmlDir, { recursive: true });
|
||||
for (const { url, html } of capturedHtml) {
|
||||
const filename = `${Date.now()}-${url.replace(/[^a-z0-9]/gi, '_')}.html`;
|
||||
await fs_1.promises.writeFile(path_1.default.join(htmlDir, filename), html);
|
||||
}
|
||||
rawHtmlLocation = htmlDir;
|
||||
}
|
||||
// Update sandbox with results
|
||||
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET
|
||||
status = $1,
|
||||
urls_tested = $2,
|
||||
menu_entry_points = $3,
|
||||
raw_html_location = $4,
|
||||
analysis_json = $5,
|
||||
confidence_score = $6,
|
||||
analyzed_at = NOW(),
|
||||
updated_at = NOW()
|
||||
WHERE id = $7`, [
|
||||
menuEntryPoints.length > 0 ? 'needs_human_review' : 'pending',
|
||||
JSON.stringify(urlsTested),
|
||||
JSON.stringify(menuEntryPoints),
|
||||
rawHtmlLocation,
|
||||
JSON.stringify(analysisData),
|
||||
menuEntryPoints.length > 0 ? 50 : 20,
|
||||
sandbox.id,
|
||||
]);
|
||||
// Update dispensary status
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review', // Sandbox results need review
});
logger_1.logger.info('crawler-jobs', `Sandbox crawl completed for dispensary ${dispensaryId}: ${menuEntryPoints.length} menu pages found`);
return {
success: true,
message: `Sandbox crawl completed. Found ${menuEntryPoints.length} menu entry points.`,
data: {
sandboxId: sandbox.id,
urlsTested: urlsTested.length,
menuEntryPoints,
analysisData,
},
};
}
catch (error) {
logger_1.logger.error('crawler-jobs', `Sandbox crawl failed for dispensary ${dispensaryId}: ${error.message}`);
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: `Sandbox crawl failed: ${error.message}`,
});
return { success: false, message: error.message };
}
finally {
if (browser) {
await browser.close();
}
}
}
// ========================================
// Queue Processing Functions
// ========================================
/**
* Process pending sandbox jobs
*/
async function processSandboxJobs(limit = 5) {
// Claim pending jobs
const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = 'running', worker_id = $1, started_at = NOW()
WHERE id IN (
SELECT id FROM sandbox_crawl_jobs
WHERE status = 'pending' AND scheduled_at <= NOW()
ORDER BY priority DESC, scheduled_at ASC
LIMIT $2
FOR UPDATE SKIP LOCKED
)
RETURNING *`, [WORKER_ID, limit]);
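// FOR UPDATE SKIP LOCKED lets multiple workers poll this table concurrently:
// each worker atomically claims up to `limit` pending rows and skips rows
// already locked by another worker, so no job is picked up twice.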
for (const job of jobs.rows) {
try {
let result;
if (job.job_type === 'detection') {
result = await runDetectMenuProviderJob(job.dispensary_id);
}
else {
result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
}
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`, [
result.success ? 'completed' : 'failed',
JSON.stringify(result.data || {}),
result.success ? null : result.message,
job.id,
]);
}
catch (error) {
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]);
}
}
}
202
backend/dist/services/crawler-logger.js
vendored
Normal file
202
backend/dist/services/crawler-logger.js
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
"use strict";
|
||||
/**
|
||||
* CrawlerLogger - Structured logging for crawler operations
|
||||
*
|
||||
* High-signal, low-noise logging with JSON output for:
|
||||
* - Job lifecycle (one summary per job)
|
||||
* - Provider/mode changes
|
||||
* - Sandbox events
|
||||
* - Queue failures
|
||||
*
|
||||
* NO per-product logging - that's too noisy.
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.crawlerLogger = void 0;
|
||||
class CrawlerLoggerService {
|
||||
formatLog(payload) {
|
||||
return JSON.stringify(payload);
|
||||
}
|
||||
log(payload) {
|
||||
const formatted = this.formatLog(payload);
|
||||
switch (payload.level) {
|
||||
case 'error':
|
||||
console.error(`[CRAWLER] ${formatted}`);
|
||||
break;
|
||||
case 'warn':
|
||||
console.warn(`[CRAWLER] ${formatted}`);
|
||||
break;
|
||||
case 'debug':
|
||||
console.debug(`[CRAWLER] ${formatted}`);
|
||||
break;
|
||||
default:
|
||||
console.log(`[CRAWLER] ${formatted}`);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Log when a crawl job starts
|
||||
*/
|
||||
jobStarted(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'info',
|
||||
event: 'job_started',
|
||||
job_id: params.job_id,
|
||||
store_id: params.store_id,
|
||||
store_name: params.store_name,
|
||||
job_type: params.job_type,
|
||||
trigger_type: params.trigger_type,
|
||||
provider: params.provider,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log when a crawl job completes successfully
|
||||
*/
|
||||
jobCompleted(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'info',
|
||||
event: 'job_completed',
|
||||
job_id: params.job_id,
|
||||
store_id: params.store_id,
|
||||
store_name: params.store_name,
|
||||
duration_ms: params.duration_ms,
|
||||
products_found: params.products_found,
|
||||
products_new: params.products_new,
|
||||
products_updated: params.products_updated,
|
||||
products_marked_oos: params.products_marked_oos,
|
||||
provider: params.provider,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log when a crawl job fails
|
||||
*/
|
||||
jobFailed(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'error',
|
||||
event: 'job_failed',
|
||||
job_id: params.job_id,
|
||||
store_id: params.store_id,
|
||||
store_name: params.store_name,
|
||||
duration_ms: params.duration_ms,
|
||||
error_message: params.error_message,
|
||||
error_code: params.error_code,
|
||||
provider: params.provider,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log when a provider is detected for a dispensary
|
||||
*/
|
||||
providerDetected(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'info',
|
||||
event: 'provider_detected',
|
||||
dispensary_id: params.dispensary_id,
|
||||
dispensary_name: params.dispensary_name,
|
||||
detected_provider: params.detected_provider,
|
||||
confidence: params.confidence,
|
||||
detection_method: params.detection_method,
|
||||
menu_url: params.menu_url,
|
||||
category: params.category,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log when a dispensary's provider changes
|
||||
*/
|
||||
providerChanged(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'info',
|
||||
event: 'provider_changed',
|
||||
dispensary_id: params.dispensary_id,
|
||||
dispensary_name: params.dispensary_name,
|
||||
old_provider: params.old_provider,
|
||||
new_provider: params.new_provider,
|
||||
old_confidence: params.old_confidence,
|
||||
new_confidence: params.new_confidence,
|
||||
category: params.category,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log when a dispensary's crawler mode changes (sandbox -> production, etc.)
|
||||
*/
|
||||
modeChanged(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'info',
|
||||
event: 'mode_changed',
|
||||
dispensary_id: params.dispensary_id,
|
||||
dispensary_name: params.dispensary_name,
|
||||
old_mode: params.old_mode,
|
||||
new_mode: params.new_mode,
|
||||
reason: params.reason,
|
||||
category: params.category,
|
||||
provider: params.provider,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log sandbox crawl events
|
||||
*/
|
||||
sandboxEvent(params) {
|
||||
const level = params.event === 'sandbox_failed' ? 'error' : 'info';
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level,
|
||||
event: params.event,
|
||||
dispensary_id: params.dispensary_id,
|
||||
dispensary_name: params.dispensary_name,
|
||||
template_name: params.template_name,
|
||||
category: params.category,
|
||||
quality_score: params.quality_score,
|
||||
products_extracted: params.products_extracted,
|
||||
fields_missing: params.fields_missing,
|
||||
error_message: params.error_message,
|
||||
provider: params.provider,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log queue processing failures
|
||||
*/
|
||||
queueFailure(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'error',
|
||||
event: 'queue_failure',
|
||||
queue_type: params.queue_type,
|
||||
error_message: params.error_message,
|
||||
affected_items: params.affected_items,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log detection scan summary
|
||||
*/
|
||||
detectionScan(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'info',
|
||||
event: 'detection_scan',
|
||||
total_scanned: params.total_scanned,
|
||||
detected: params.detected,
|
||||
failed: params.failed,
|
||||
skipped: params.skipped,
|
||||
duration_ms: params.duration_ms,
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Log intelligence run summary
|
||||
*/
|
||||
intelligenceRun(params) {
|
||||
this.log({
|
||||
timestamp: new Date().toISOString(),
|
||||
level: 'info',
|
||||
event: 'intelligence_run',
|
||||
run_type: params.run_type,
|
||||
dispensaries_processed: params.dispensaries_processed,
|
||||
jobs_queued: params.jobs_queued,
|
||||
duration_ms: params.duration_ms,
|
||||
});
|
||||
}
|
||||
}
|
||||
// Export singleton instance
|
||||
exports.crawlerLogger = new CrawlerLoggerService();
|
||||
383
backend/dist/services/dispensary-orchestrator.js
vendored
Normal file
383
backend/dist/services/dispensary-orchestrator.js
vendored
Normal file
@@ -0,0 +1,383 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Dispensary Crawl Orchestrator
|
||||
*
|
||||
* Orchestrates the complete crawl workflow for a dispensary:
|
||||
* 1. Load dispensary data
|
||||
* 2. Check if provider detection is needed
|
||||
* 3. Run provider detection if needed
|
||||
* 4. Queue appropriate crawl jobs based on provider/mode
|
||||
* 5. Update dispensary_crawl_schedule with meaningful status
|
||||
*
|
||||
* This works DIRECTLY with dispensaries (not through stores table).
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.runDispensaryOrchestrator = runDispensaryOrchestrator;
|
||||
exports.runBatchDispensaryOrchestrator = runBatchDispensaryOrchestrator;
|
||||
exports.getDispensariesDueForOrchestration = getDispensariesDueForOrchestration;
|
||||
exports.ensureAllDispensariesHaveSchedules = ensureAllDispensariesHaveSchedules;
|
||||
exports.processDispensaryScheduler = processDispensaryScheduler;
|
||||
const uuid_1 = require("uuid");
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const crawler_logger_1 = require("./crawler-logger");
|
||||
const intelligence_detector_1 = require("./intelligence-detector");
|
||||
const category_crawler_jobs_1 = require("./category-crawler-jobs");
|
||||
// ========================================
|
||||
// Main Orchestrator Function
|
||||
// ========================================
|
||||
/**
|
||||
* Run the complete crawl orchestration for a dispensary
|
||||
*
|
||||
* Behavior:
|
||||
* 1. Load the dispensary info
|
||||
* 2. If product_provider is missing or stale (>7 days), run detection
|
||||
* 3. After detection:
|
||||
* - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
|
||||
* - Otherwise: Run sandbox crawl
|
||||
* 4. Update dispensary_crawl_schedule with status/summary
|
||||
*/
|
||||
async function runDispensaryOrchestrator(dispensaryId, scheduleId) {
|
||||
const startTime = Date.now();
|
||||
const runId = (0, uuid_1.v4)();
|
||||
let result = {
|
||||
status: 'pending',
|
||||
summary: '',
|
||||
runId,
|
||||
dispensaryId,
|
||||
dispensaryName: '',
|
||||
detectionRan: false,
|
||||
crawlRan: false,
|
||||
durationMs: 0,
|
||||
};
|
||||
try {
|
||||
// Mark schedule as running
|
||||
await updateScheduleStatus(dispensaryId, 'running', 'Starting orchestrator...', null, runId);
|
||||
// 1. Load dispensary info
|
||||
const dispensary = await getDispensaryInfo(dispensaryId);
|
||||
if (!dispensary) {
|
||||
throw new Error(`Dispensary ${dispensaryId} not found`);
|
||||
}
|
||||
result.dispensaryName = dispensary.name;
|
||||
// 2. Check if provider detection is needed
|
||||
const needsDetection = await checkNeedsDetection(dispensary);
|
||||
if (needsDetection) {
|
||||
// Run provider detection
|
||||
const websiteUrl = dispensary.menu_url || dispensary.website;
|
||||
if (!websiteUrl) {
|
||||
result.status = 'error';
|
||||
result.summary = 'No website URL available for detection';
|
||||
result.error = 'Dispensary has no menu_url or website configured';
|
||||
await updateScheduleStatus(dispensaryId, 'error', result.summary, result.error, runId);
|
||||
result.durationMs = Date.now() - startTime;
|
||||
await createJobRecord(dispensaryId, scheduleId, result);
|
||||
return result;
|
||||
}
|
||||
await updateScheduleStatus(dispensaryId, 'running', 'Running provider detection...', null, runId);
|
||||
const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
|
||||
result.detectionRan = true;
|
||||
result.detectionResult = detectionResult;
|
||||
// Save detection results to dispensary
|
||||
await (0, intelligence_detector_1.updateAllCategoryProviders)(dispensaryId, detectionResult);
|
||||
crawler_logger_1.crawlerLogger.providerDetected({
|
||||
dispensary_id: dispensaryId,
|
||||
dispensary_name: dispensary.name,
|
||||
detected_provider: detectionResult.product.provider,
|
||||
confidence: detectionResult.product.confidence,
|
||||
detection_method: 'dispensary_orchestrator',
|
||||
menu_url: websiteUrl,
|
||||
category: 'product',
|
||||
});
|
||||
// Refresh dispensary info after detection
|
||||
const updatedDispensary = await getDispensaryInfo(dispensaryId);
|
||||
if (updatedDispensary) {
|
||||
Object.assign(dispensary, updatedDispensary);
|
||||
}
|
||||
}
|
||||
// 3. Determine crawl type and run
|
||||
const provider = dispensary.product_provider;
|
||||
const mode = dispensary.product_crawler_mode;
|
||||
if (provider === 'dutchie' && mode === 'production') {
|
||||
// Production Dutchie crawl
|
||||
await updateScheduleStatus(dispensaryId, 'running', 'Running Dutchie production crawl...', null, runId);
|
||||
try {
|
||||
// Run the category-specific crawl job
|
||||
const crawlResult = await (0, category_crawler_jobs_1.runCrawlProductsJob)(dispensaryId);
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'production';
|
||||
if (crawlResult.success) {
|
||||
result.productsFound = crawlResult.data?.productsFound || 0;
|
||||
const detectionPart = result.detectionRan ? 'Detection + ' : '';
|
||||
result.summary = `${detectionPart}Dutchie products crawl completed`;
|
||||
result.status = 'success';
|
||||
crawler_logger_1.crawlerLogger.jobCompleted({
|
||||
job_id: 0,
|
||||
store_id: 0,
|
||||
store_name: dispensary.name,
|
||||
duration_ms: Date.now() - startTime,
|
||||
products_found: result.productsFound || 0,
|
||||
products_new: 0,
|
||||
products_updated: 0,
|
||||
provider: 'dutchie',
|
||||
});
|
||||
}
|
||||
else {
|
||||
result.status = 'error';
|
||||
result.error = crawlResult.message;
|
||||
result.summary = `Dutchie crawl failed: ${crawlResult.message.slice(0, 100)}`;
|
||||
}
|
||||
}
|
||||
catch (crawlError) {
|
||||
result.status = 'error';
|
||||
result.error = crawlError.message;
|
||||
result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'production';
|
||||
crawler_logger_1.crawlerLogger.jobFailed({
|
||||
job_id: 0,
|
||||
store_id: 0,
|
||||
store_name: dispensary.name,
|
||||
duration_ms: Date.now() - startTime,
|
||||
error_message: crawlError.message,
|
||||
provider: 'dutchie',
|
||||
});
|
||||
}
|
||||
}
|
||||
else if (provider && provider !== 'unknown') {
|
||||
// Sandbox crawl for non-Dutchie or sandbox mode
|
||||
await updateScheduleStatus(dispensaryId, 'running', `Running ${provider} sandbox crawl...`, null, runId);
|
||||
try {
|
||||
const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(dispensaryId);
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'sandbox';
|
||||
result.productsFound = sandboxResult.data?.productsExtracted || 0;
|
||||
const detectionPart = result.detectionRan ? 'Detection + ' : '';
|
||||
if (sandboxResult.success) {
|
||||
result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
|
||||
result.status = 'sandbox_only';
|
||||
}
|
||||
else {
|
||||
result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
|
||||
result.status = 'error';
|
||||
result.error = sandboxResult.message;
|
||||
}
|
||||
}
|
||||
catch (sandboxError) {
|
||||
result.status = 'error';
|
||||
result.error = sandboxError.message;
|
||||
result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'sandbox';
|
||||
}
|
||||
}
|
||||
else {
|
||||
// No provider detected - detection only
|
||||
if (result.detectionRan) {
|
||||
result.summary = `Detection complete: provider=${dispensary.product_provider || 'unknown'}, confidence=${dispensary.product_confidence || 0}%`;
|
||||
result.status = 'detection_only';
|
||||
}
|
||||
else {
|
||||
result.summary = 'No provider detected and no crawl possible';
|
||||
result.status = 'error';
|
||||
result.error = 'Could not determine menu provider';
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
result.status = 'error';
|
||||
result.error = error.message;
|
||||
result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
|
||||
crawler_logger_1.crawlerLogger.queueFailure({
|
||||
queue_type: 'dispensary_orchestrator',
|
||||
error_message: error.message,
|
||||
});
|
||||
}
|
||||
result.durationMs = Date.now() - startTime;
|
||||
// Update final schedule status
|
||||
await updateScheduleStatus(dispensaryId, result.status, result.summary, result.error || null, runId);
|
||||
// Create job record
|
||||
await createJobRecord(dispensaryId, scheduleId, result);
|
||||
return result;
|
||||
}
|
||||
// ========================================
|
||||
// Helper Functions
|
||||
// ========================================
|
||||
async function getDispensaryInfo(dispensaryId) {
|
||||
const result = await migrate_1.pool.query(`SELECT id, name, city, website, menu_url,
|
||||
product_provider, product_confidence, product_crawler_mode, last_product_scan_at
|
||||
FROM dispensaries
|
||||
WHERE id = $1`, [dispensaryId]);
|
||||
return result.rows[0] || null;
|
||||
}
|
||||
async function checkNeedsDetection(dispensary) {
|
||||
// No provider = definitely needs detection
|
||||
if (!dispensary.product_provider)
|
||||
return true;
|
||||
// Unknown provider = needs detection
|
||||
if (dispensary.product_provider === 'unknown')
|
||||
return true;
|
||||
// Low confidence = needs re-detection
|
||||
if (dispensary.product_confidence !== null && dispensary.product_confidence < 50)
|
||||
return true;
|
||||
// Stale detection (> 7 days) = needs refresh
|
||||
if (dispensary.last_product_scan_at) {
|
||||
const daysSince = (Date.now() - new Date(dispensary.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
|
||||
if (daysSince > 7)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
async function updateScheduleStatus(dispensaryId, status, summary, error, runId) {
|
||||
await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, last_status, last_summary, last_error, last_run_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, NOW(), NOW())
|
||||
ON CONFLICT (dispensary_id) DO UPDATE SET
|
||||
last_status = $2,
|
||||
last_summary = $3,
|
||||
last_error = $4,
|
||||
last_run_at = NOW(),
|
||||
updated_at = NOW()`, [dispensaryId, status, summary, error]);
|
||||
}
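// Note: the ON CONFLICT (dispensary_id) upsert above relies on a unique
// constraint (or unique index) on dispensary_crawl_schedule.dispensary_id.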
|
||||
async function createJobRecord(dispensaryId, scheduleId, result) {
|
||||
await migrate_1.pool.query(`INSERT INTO dispensary_crawl_jobs (
|
||||
dispensary_id, schedule_id, job_type, trigger_type, status, priority,
|
||||
scheduled_at, started_at, completed_at, duration_ms,
|
||||
detection_ran, crawl_ran, crawl_type,
|
||||
products_found, products_new, products_updated,
|
||||
detected_provider, detected_confidence, detected_mode,
|
||||
error_message, run_id
|
||||
) VALUES (
|
||||
$1, $2, 'orchestrator', 'manual', $3, 100,
|
||||
NOW(), NOW(), NOW(), $4,
|
||||
$5, $6, $7,
|
||||
$8, $9, $10,
|
||||
$11, $12, $13,
|
||||
$14, $15
|
||||
)`, [
|
||||
dispensaryId,
|
||||
scheduleId || null,
|
||||
result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
|
||||
result.durationMs,
|
||||
result.detectionRan,
|
||||
result.crawlRan,
|
||||
result.crawlType || null,
|
||||
result.productsFound || null,
|
||||
result.productsNew || null,
|
||||
result.productsUpdated || null,
|
||||
result.detectionResult?.product.provider || null,
|
||||
result.detectionResult?.product.confidence || null,
|
||||
result.detectionResult?.product.mode || null,
|
||||
result.error || null,
|
||||
result.runId,
|
||||
]);
|
||||
// Update schedule stats
|
||||
if (result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only') {
|
||||
await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
|
||||
total_runs = COALESCE(total_runs, 0) + 1,
|
||||
successful_runs = COALESCE(successful_runs, 0) + 1,
|
||||
consecutive_failures = 0,
|
||||
next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
|
||||
last_duration_ms = $2
|
||||
WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
|
||||
}
|
||||
else if (result.status === 'error') {
|
||||
await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
|
||||
total_runs = COALESCE(total_runs, 0) + 1,
|
||||
consecutive_failures = COALESCE(consecutive_failures, 0) + 1,
|
||||
next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
|
||||
last_duration_ms = $2
|
||||
WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
|
||||
}
|
||||
}
|
||||
// ========================================
|
||||
// Batch Processing
|
||||
// ========================================
|
||||
/**
|
||||
* Run orchestrator for multiple dispensaries
|
||||
*/
|
||||
async function runBatchDispensaryOrchestrator(dispensaryIds, concurrency = 3) {
|
||||
const results = [];
|
||||
// Process in batches
|
||||
for (let i = 0; i < dispensaryIds.length; i += concurrency) {
|
||||
const batch = dispensaryIds.slice(i, i + concurrency);
|
||||
console.log(`Processing batch ${Math.floor(i / concurrency) + 1}: dispensaries ${batch.join(', ')}`);
|
||||
const batchResults = await Promise.all(batch.map(id => runDispensaryOrchestrator(id)));
|
||||
results.push(...batchResults);
|
||||
// Small delay between batches to avoid overwhelming the system
|
||||
if (i + concurrency < dispensaryIds.length) {
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
/**
|
||||
* Get dispensaries that are due for orchestration
|
||||
*/
|
||||
async function getDispensariesDueForOrchestration(limit = 10) {
|
||||
const result = await migrate_1.pool.query(`SELECT d.id
|
||||
FROM dispensaries d
|
||||
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
|
||||
WHERE COALESCE(dcs.is_active, TRUE) = TRUE
|
||||
AND (
|
||||
dcs.next_run_at IS NULL
|
||||
OR dcs.next_run_at <= NOW()
|
||||
)
|
||||
AND (dcs.last_status IS NULL OR dcs.last_status NOT IN ('running', 'pending'))
|
||||
ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST
|
||||
LIMIT $1`, [limit]);
|
||||
return result.rows.map(row => row.id);
|
||||
}
|
||||
/**
|
||||
* Ensure all dispensaries have schedule entries
|
||||
*/
|
||||
async function ensureAllDispensariesHaveSchedules(intervalMinutes = 240) {
|
||||
// Get all dispensary IDs that don't have a schedule
|
||||
const result = await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
|
||||
SELECT d.id, TRUE, $1, 0
|
||||
FROM dispensaries d
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
|
||||
)
|
||||
RETURNING id`, [intervalMinutes]);
|
||||
const existingCount = await migrate_1.pool.query('SELECT COUNT(*) FROM dispensary_crawl_schedule');
|
||||
return {
|
||||
created: result.rowCount || 0,
|
||||
existing: parseInt(existingCount.rows[0].count) - (result.rowCount || 0),
|
||||
};
|
||||
}
|
||||
// ========================================
|
||||
// Scheduler Integration
|
||||
// ========================================
|
||||
let dispensarySchedulerRunning = false;
|
||||
/**
|
||||
* Process dispensaries using the intelligent orchestrator
|
||||
* Called periodically by the scheduler
|
||||
*/
|
||||
async function processDispensaryScheduler() {
|
||||
if (dispensarySchedulerRunning) {
|
||||
console.log('Dispensary scheduler already running, skipping...');
|
||||
return;
|
||||
}
|
||||
dispensarySchedulerRunning = true;
|
||||
try {
|
||||
// Get dispensaries due for orchestration
|
||||
const dispensaryIds = await getDispensariesDueForOrchestration(3);
|
||||
if (dispensaryIds.length === 0) {
|
||||
return;
|
||||
}
|
||||
console.log(`Dispensary Scheduler: Processing ${dispensaryIds.length} dispensaries due for crawl`);
|
||||
// Process each dispensary through the orchestrator
|
||||
for (const dispensaryId of dispensaryIds) {
|
||||
try {
|
||||
console.log(`Dispensary Scheduler: Starting crawl for dispensary ${dispensaryId}`);
|
||||
const result = await runDispensaryOrchestrator(dispensaryId);
|
||||
console.log(`Dispensary Scheduler: Dispensary ${dispensaryId} completed - ${result.summary}`);
|
||||
}
|
||||
catch (error) {
|
||||
console.error(`Dispensary Scheduler: Dispensary ${dispensaryId} failed - ${error.message}`);
|
||||
}
|
||||
}
|
||||
console.log(`Dispensary Scheduler: Finished processing ${dispensaryIds.length} dispensaries`);
|
||||
}
|
||||
finally {
|
||||
dispensarySchedulerRunning = false;
|
||||
}
|
||||
}
|
||||
125
backend/dist/services/geolocation.js
vendored
Normal file
125
backend/dist/services/geolocation.js
vendored
Normal file
@@ -0,0 +1,125 @@
|
||||
"use strict";
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.lookupProxyLocation = lookupProxyLocation;
|
||||
exports.updateProxyLocation = updateProxyLocation;
|
||||
exports.updateAllProxyLocations = updateAllProxyLocations;
|
||||
exports.queueProxyLocationUpdate = queueProxyLocationUpdate;
|
||||
const axios_1 = __importDefault(require("axios"));
|
||||
const migrate_1 = require("../db/migrate");
|
||||
// Free API - 45 requests/minute limit
|
||||
const GEOLOCATION_API = 'http://ip-api.com/json/';
|
||||
async function lookupProxyLocation(host) {
|
||||
try {
|
||||
const response = await axios_1.default.get(`${GEOLOCATION_API}${host}?fields=status,message,country,countryCode,regionName,city,query`);
|
||||
const data = response.data;
|
||||
if (data.status === 'fail') {
|
||||
console.log(`❌ Geolocation lookup failed for ${host}: ${data.message}`);
|
||||
return null;
|
||||
}
|
||||
return data;
|
||||
}
|
||||
catch (error) {
|
||||
console.error(`❌ Error looking up location for ${host}:`, error.message);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
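// Illustrative ip-api.com success payload for the fields requested above
// (values are made up for the example):
//   { "status": "success", "country": "United States", "countryCode": "US",
//     "regionName": "Arizona", "city": "Phoenix", "query": "203.0.113.7" }
// On failure the API returns { "status": "fail", "message": "..." } instead.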
async function updateProxyLocation(proxyId, location) {
await migrate_1.pool.query(`
UPDATE proxies
SET city = $1,
state = $2,
country = $3,
country_code = $4,
location_updated_at = CURRENT_TIMESTAMP
WHERE id = $5
`, [
location.city,
location.regionName,
location.country,
location.countryCode,
proxyId
]);
}
async function updateAllProxyLocations(batchSize = 45) {
console.log('🌍 Starting proxy location update job...');
// Get all proxies without location data
const result = await migrate_1.pool.query(`
SELECT id, host
FROM proxies
WHERE location_updated_at IS NULL
OR location_updated_at < CURRENT_TIMESTAMP - INTERVAL '30 days'
ORDER BY id
`);
const proxies = result.rows;
console.log(`📊 Found ${proxies.length} proxies to update`);
let updated = 0;
let failed = 0;
// Process in batches to respect rate limit (45 req/min)
for (let i = 0; i < proxies.length; i += batchSize) {
const batch = proxies.slice(i, i + batchSize);
console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(proxies.length / batchSize)} (${batch.length} proxies)`);
// Process batch
for (const proxy of batch) {
const location = await lookupProxyLocation(proxy.host);
if (location) {
await updateProxyLocation(proxy.id, location);
console.log(`✅ Updated ${proxy.id}: ${location.city}, ${location.regionName} - ${location.country}`);
updated++;
}
else {
console.log(`⚠️ Failed to get location for proxy ${proxy.id} (${proxy.host})`);
failed++;
}
// Small delay between requests
await new Promise(resolve => setTimeout(resolve, 100));
}
// Wait 60 seconds before next batch to respect rate limit
if (i + batchSize < proxies.length) {
console.log(`⏳ Waiting 60s before next batch (rate limit: 45 req/min)...`);
await new Promise(resolve => setTimeout(resolve, 60000));
}
}
console.log(`✅ Proxy location update complete!`);
console.log(` Updated: ${updated}`);
console.log(` Failed: ${failed}`);
}
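// Rough throughput note: each batch of 45 lookups takes about a minute (the
// 100ms per-request delay plus the 60s inter-batch wait), so a backlog of N
// proxies clears in roughly N/45 minutes under the free-tier limit.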
// Queue for background processing
const locationUpdateQueue = new Set();
let isProcessing = false;
function queueProxyLocationUpdate(proxyId) {
locationUpdateQueue.add(proxyId);
processLocationQueue();
}
async function processLocationQueue() {
if (isProcessing || locationUpdateQueue.size === 0)
return;
isProcessing = true;
try {
const proxyIds = Array.from(locationUpdateQueue);
locationUpdateQueue.clear();
console.log(`🌍 Processing ${proxyIds.length} proxy location updates from queue`);
for (const proxyId of proxyIds) {
const result = await migrate_1.pool.query('SELECT host FROM proxies WHERE id = $1', [proxyId]);
if (result.rows.length === 0)
continue;
const host = result.rows[0].host;
const location = await lookupProxyLocation(host);
if (location) {
await updateProxyLocation(proxyId, location);
console.log(`✅ Queue: Updated ${proxyId}: ${location.city}, ${location.regionName} - ${location.country}`);
}
// Respect rate limit
await new Promise(resolve => setTimeout(resolve, 1500)); // ~40 req/min
}
}
finally {
isProcessing = false;
// Process any new items that were added while we were processing
if (locationUpdateQueue.size > 0) {
processLocationQueue();
}
}
}
493
backend/dist/services/intelligence-detector.js
vendored
Normal file
493
backend/dist/services/intelligence-detector.js
vendored
Normal file
@@ -0,0 +1,493 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Multi-Category Intelligence Detector
|
||||
*
|
||||
* Detects providers for each intelligence category independently:
|
||||
* - Products: Which provider serves product data
|
||||
* - Specials: Which provider serves deals/specials
|
||||
* - Brand: Which provider serves brand information
|
||||
* - Metadata: Which provider serves taxonomy/category data
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.detectMultiCategoryProviders = detectMultiCategoryProviders;
|
||||
exports.detectCategoryProviderChange = detectCategoryProviderChange;
|
||||
exports.updateDispensaryCategoryProvider = updateDispensaryCategoryProvider;
|
||||
exports.updateAllCategoryProviders = updateAllCategoryProviders;
|
||||
exports.moveCategoryToSandbox = moveCategoryToSandbox;
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const logger_1 = require("./logger");
|
||||
const puppeteer_1 = __importDefault(require("puppeteer"));
|
||||
// Production-ready providers per category
|
||||
// Only these combinations can be set to production mode
|
||||
const PRODUCTION_READY = {
|
||||
product: ['dutchie'], // Only Dutchie products are production-ready
|
||||
specials: [], // None yet
|
||||
brand: [], // None yet
|
||||
metadata: [], // None yet
|
||||
};
|
||||
// Provider detection patterns
|
||||
const PROVIDER_PATTERNS = {
|
||||
dutchie: {
|
||||
scripts: [
|
||||
/dutchie\.com/i,
|
||||
/dutchie-plus/i,
|
||||
/dutchie\.js/i,
|
||||
/__DUTCHIE__/i,
|
||||
/dutchie-embed/i,
|
||||
],
|
||||
iframes: [
|
||||
/dutchie\.com/i,
|
||||
/dutchie-plus\.com/i,
|
||||
/embed\.dutchie/i,
|
||||
],
|
||||
html: [
|
||||
/class="dutchie/i,
|
||||
/id="dutchie/i,
|
||||
/data-dutchie/i,
|
||||
/"menuType":\s*"dutchie"/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/dutchie\.com\/graphql/i,
|
||||
/plus\.dutchie\.com/i,
|
||||
],
|
||||
metaTags: [
|
||||
/dutchie/i,
|
||||
],
|
||||
},
|
||||
treez: {
|
||||
scripts: [
|
||||
/treez\.io/i,
|
||||
/treez-ecommerce/i,
|
||||
/treez\.js/i,
|
||||
],
|
||||
iframes: [
|
||||
/treez\.io/i,
|
||||
/shop\.treez/i,
|
||||
],
|
||||
html: [
|
||||
/class="treez/i,
|
||||
/data-treez/i,
|
||||
/treez-menu/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.treez\.io/i,
|
||||
/treez\.io\/api/i,
|
||||
],
|
||||
metaTags: [],
|
||||
},
|
||||
jane: {
|
||||
scripts: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
/jane-frame/i,
|
||||
/jane\.js/i,
|
||||
],
|
||||
iframes: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
/embed\.iheartjane/i,
|
||||
],
|
||||
html: [
|
||||
/class="jane/i,
|
||||
/data-jane/i,
|
||||
/jane-embed/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.iheartjane/i,
|
||||
/jane\.co\/api/i,
|
||||
],
|
||||
metaTags: [],
|
||||
},
|
||||
weedmaps: {
|
||||
scripts: [
|
||||
/weedmaps\.com/i,
|
||||
/wm-menu/i,
|
||||
],
|
||||
iframes: [
|
||||
/weedmaps\.com/i,
|
||||
/menu\.weedmaps/i,
|
||||
],
|
||||
html: [
|
||||
/data-weedmaps/i,
|
||||
/wm-menu/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api-g\.weedmaps/i,
|
||||
/weedmaps\.com\/api/i,
|
||||
],
|
||||
metaTags: [],
|
||||
},
|
||||
leafly: {
|
||||
scripts: [
|
||||
/leafly\.com/i,
|
||||
/leafly-menu/i,
|
||||
],
|
||||
iframes: [
|
||||
/leafly\.com/i,
|
||||
/order\.leafly/i,
|
||||
],
|
||||
html: [
|
||||
/data-leafly/i,
|
||||
/leafly-embed/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.leafly/i,
|
||||
],
|
||||
metaTags: [],
|
||||
},
|
||||
};
|
||||
// Category-specific detection signals
|
||||
const CATEGORY_SIGNALS = {
|
||||
product: {
|
||||
urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
|
||||
htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
|
||||
jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
|
||||
},
|
||||
specials: {
|
||||
urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
|
||||
htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
|
||||
jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
|
||||
},
|
||||
brand: {
|
||||
urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
|
||||
htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
|
||||
jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
|
||||
},
|
||||
metadata: {
|
||||
urlPatterns: [/\/categories/i, /\/taxonomy/i],
|
||||
htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
|
||||
jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
|
||||
},
|
||||
};
|
||||
// ========================================
|
||||
// Main Detection Function
|
||||
// ========================================
|
||||
async function detectMultiCategoryProviders(websiteUrl, options = {}) {
|
||||
const { timeout = 30000, headless = true, existingBrowser } = options;
|
||||
let browser = null;
|
||||
let page = null;
|
||||
const urlsTested = [];
|
||||
const rawSignals = {};
|
||||
try {
|
||||
browser = existingBrowser || await puppeteer_1.default.launch({
|
||||
headless,
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
|
||||
});
|
||||
page = await browser.newPage();
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
|
||||
// Navigate to main site
|
||||
const baseUrl = normalizeUrl(websiteUrl);
|
||||
urlsTested.push(baseUrl);
|
||||
await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });
|
||||
// Collect signals from main page
|
||||
const mainPageSignals = await collectPageSignals(page);
|
||||
rawSignals.mainPage = mainPageSignals;
|
||||
// Try common menu URLs
|
||||
const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
|
||||
for (const path of menuUrls) {
|
||||
try {
|
||||
const fullUrl = new URL(path, baseUrl).toString();
|
||||
urlsTested.push(fullUrl);
|
||||
await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 });
|
||||
const signals = await collectPageSignals(page);
|
||||
rawSignals[path] = signals;
|
||||
}
|
||||
catch {
|
||||
// URL doesn't exist or timed out
|
||||
}
|
||||
}
|
||||
// Analyze signals for each category
|
||||
const result = {
|
||||
product: analyzeCategorySignals('product', rawSignals),
|
||||
specials: analyzeCategorySignals('specials', rawSignals),
|
||||
brand: analyzeCategorySignals('brand', rawSignals),
|
||||
metadata: analyzeCategorySignals('metadata', rawSignals),
|
||||
urlsTested,
|
||||
rawSignals,
|
||||
};
|
||||
logger_1.logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
|
||||
return result;
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
|
||||
// Return unknown results for all categories
|
||||
return {
|
||||
product: createUnknownResult(),
|
||||
specials: createUnknownResult(),
|
||||
brand: createUnknownResult(),
|
||||
metadata: createUnknownResult(),
|
||||
urlsTested,
|
||||
rawSignals: { error: error.message },
|
||||
};
|
||||
}
|
||||
finally {
|
||||
if (page)
|
||||
await page.close().catch(() => { });
|
||||
if (browser && !existingBrowser)
|
||||
await browser.close().catch(() => { });
|
||||
}
|
||||
}
|
||||
// ========================================
|
||||
// Helper Functions
|
||||
// ========================================
|
||||
function normalizeUrl(url) {
|
||||
if (!url.startsWith('http')) {
|
||||
url = 'https://' + url;
|
||||
}
|
||||
return url.replace(/\/$/, '');
|
||||
}
|
||||
async function collectPageSignals(page) {
|
||||
return page.evaluate(() => {
|
||||
const signals = {
|
||||
scripts: [],
|
||||
iframes: [],
|
||||
links: [],
|
||||
metaTags: [],
|
||||
bodyClasses: document.body?.className || '',
|
||||
bodyId: document.body?.id || '',
|
||||
htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
|
||||
};
|
||||
// Collect script sources
|
||||
document.querySelectorAll('script[src]').forEach((el) => {
|
||||
signals.scripts.push(el.src);
|
||||
});
|
||||
// Collect inline scripts
|
||||
document.querySelectorAll('script:not([src])').forEach((el) => {
|
||||
const content = el.textContent || '';
|
||||
if (content.length < 5000) {
|
||||
signals.scripts.push(`inline:${content.slice(0, 500)}`);
|
||||
}
|
||||
});
|
||||
// Collect iframes
|
||||
document.querySelectorAll('iframe').forEach((el) => {
|
||||
signals.iframes.push(el.src);
|
||||
});
|
||||
// Collect links
|
||||
document.querySelectorAll('a[href]').forEach((el) => {
|
||||
signals.links.push(el.href);
|
||||
});
|
||||
// Collect meta tags
|
||||
document.querySelectorAll('meta').forEach((el) => {
|
||||
const content = el.getAttribute('content') || '';
|
||||
const name = el.getAttribute('name') || el.getAttribute('property') || '';
|
||||
if (content || name) {
|
||||
signals.metaTags.push(`${name}:${content}`);
|
||||
}
|
||||
});
|
||||
// Look for JSON data
|
||||
const jsonBlocks = [];
|
||||
document.querySelectorAll('script[type="application/json"]').forEach((el) => {
|
||||
jsonBlocks.push(el.textContent?.slice(0, 2000) || '');
|
||||
});
|
||||
signals.jsonBlocks = jsonBlocks;
|
||||
return signals;
|
||||
});
|
||||
}
|
||||
function analyzeCategorySignals(category, allSignals) {
|
||||
const providerScores = {};
|
||||
const detectedSignals = {};
|
||||
// Initialize scores
|
||||
for (const provider of Object.keys(PROVIDER_PATTERNS)) {
|
||||
providerScores[provider] = 0;
|
||||
}
|
||||
// Analyze each page's signals
|
||||
for (const [pagePath, signals] of Object.entries(allSignals)) {
|
||||
if (!signals || typeof signals !== 'object')
|
||||
continue;
|
||||
// Check for provider-specific patterns
|
||||
for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
|
||||
let score = 0;
|
||||
// Check scripts
|
||||
if (signals.scripts) {
|
||||
for (const script of signals.scripts) {
|
||||
for (const pattern of patterns.scripts) {
|
||||
if (pattern.test(script)) {
|
||||
score += 20;
|
||||
detectedSignals[`${provider}_script_${pagePath}`] = script;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check iframes
|
||||
if (signals.iframes) {
|
||||
for (const iframe of signals.iframes) {
|
||||
for (const pattern of patterns.iframes) {
|
||||
if (pattern.test(iframe)) {
|
||||
score += 25;
|
||||
detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check HTML content
|
||||
if (signals.htmlSnippet) {
|
||||
for (const pattern of patterns.html) {
|
||||
if (pattern.test(signals.htmlSnippet)) {
|
||||
score += 15;
|
||||
detectedSignals[`${provider}_html_${pagePath}`] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
providerScores[provider] += score;
|
||||
}
|
||||
// Check for category-specific signals on relevant pages
|
||||
const categorySignals = CATEGORY_SIGNALS[category];
|
||||
const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
|
||||
if (isRelevantPage && signals.htmlSnippet) {
|
||||
for (const pattern of categorySignals.htmlPatterns) {
|
||||
if (pattern.test(signals.htmlSnippet)) {
|
||||
detectedSignals[`${category}_html_pattern`] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check JSON blocks for category data
|
||||
if (signals.jsonBlocks) {
|
||||
for (const json of signals.jsonBlocks) {
|
||||
for (const key of categorySignals.jsonKeys) {
|
||||
if (json.toLowerCase().includes(`"${key}"`)) {
|
||||
detectedSignals[`${category}_json_key_${key}`] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Determine winning provider
|
||||
let bestProvider = 'unknown';
|
||||
let bestScore = 0;
|
||||
for (const [provider, score] of Object.entries(providerScores)) {
|
||||
if (score > bestScore) {
|
||||
bestScore = score;
|
||||
bestProvider = provider;
|
||||
}
|
||||
}
|
||||
// Calculate confidence (0-100)
|
||||
const confidence = Math.min(100, bestScore);
|
||||
// Determine mode based on provider and confidence
|
||||
const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
|
||||
const mode = isProductionReady && confidence >= 70
|
||||
? 'production'
|
||||
: 'sandbox';
|
||||
// Get template name if available
|
||||
let templateName;
|
||||
if (bestProvider === 'dutchie' && category === 'product') {
|
||||
templateName = 'dutchie_standard';
|
||||
}
|
||||
else if (bestProvider === 'treez') {
|
||||
templateName = 'treez_products_v0';
|
||||
}
|
||||
return {
|
||||
provider: bestProvider,
|
||||
confidence,
|
||||
mode,
|
||||
signals: detectedSignals,
|
||||
templateName,
|
||||
};
|
||||
}
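// Scoring recap: each matching script src adds 20, each matching iframe 25,
// and each HTML pattern 15 per page; the winning provider's total is capped
// at 100 and used as confidence. Production mode additionally requires the
// provider to be listed in PRODUCTION_READY for the category and confidence >= 70.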
|
||||
function createUnknownResult() {
|
||||
return {
|
||||
provider: 'unknown',
|
||||
confidence: 0,
|
||||
mode: 'sandbox',
|
||||
signals: {},
|
||||
};
|
||||
}
|
||||
// ========================================
|
||||
// Lightweight Per-Category Change Detection
|
||||
// ========================================
|
||||
async function detectCategoryProviderChange(page, category, expectedProvider) {
|
||||
try {
|
||||
const signals = await collectPageSignals(page);
|
||||
const result = analyzeCategorySignals(category, { currentPage: signals });
|
||||
if (result.provider !== expectedProvider && result.confidence > 50) {
|
||||
logger_1.logger.warn('provider-detection', `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`);
|
||||
return {
|
||||
changed: true,
|
||||
newProvider: result.provider,
|
||||
confidence: result.confidence,
|
||||
};
|
||||
}
|
||||
return { changed: false };
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('provider-detection', `Change detection failed: ${error.message}`);
|
||||
return { changed: false };
|
||||
}
|
||||
}
|
||||
// ========================================
|
||||
// Database Operations
|
||||
// ========================================
|
||||
async function updateDispensaryCategoryProvider(dispensaryId, category, result) {
|
||||
const columnPrefix = category === 'product' ? 'product' :
|
||||
category === 'specials' ? 'specials' :
|
||||
category === 'brand' ? 'brand' : 'metadata';
|
||||
await migrate_1.pool.query(`UPDATE dispensaries SET
|
||||
${columnPrefix}_provider = $1,
|
||||
${columnPrefix}_confidence = $2,
|
||||
${columnPrefix}_crawler_mode = $3,
|
||||
${columnPrefix}_detection_data = $4,
|
||||
updated_at = NOW()
|
||||
WHERE id = $5`, [
|
||||
result.provider,
|
||||
result.confidence,
|
||||
result.mode,
|
||||
JSON.stringify(result.signals),
|
||||
dispensaryId,
|
||||
]);
|
||||
}
|
||||
async function updateAllCategoryProviders(dispensaryId, result) {
|
||||
await migrate_1.pool.query(`UPDATE dispensaries SET
|
||||
product_provider = $1,
|
||||
product_confidence = $2,
|
||||
product_crawler_mode = $3,
|
||||
product_detection_data = $4,
|
||||
specials_provider = $5,
|
||||
specials_confidence = $6,
|
||||
specials_crawler_mode = $7,
|
||||
specials_detection_data = $8,
|
||||
brand_provider = $9,
|
||||
brand_confidence = $10,
|
||||
brand_crawler_mode = $11,
|
||||
brand_detection_data = $12,
|
||||
metadata_provider = $13,
|
||||
metadata_confidence = $14,
|
||||
metadata_crawler_mode = $15,
|
||||
metadata_detection_data = $16,
|
||||
updated_at = NOW()
|
||||
WHERE id = $17`, [
|
||||
result.product.provider,
|
||||
result.product.confidence,
|
||||
result.product.mode,
|
||||
JSON.stringify(result.product.signals),
|
||||
result.specials.provider,
|
||||
result.specials.confidence,
|
||||
result.specials.mode,
|
||||
JSON.stringify(result.specials.signals),
|
||||
result.brand.provider,
|
||||
result.brand.confidence,
|
||||
result.brand.mode,
|
||||
JSON.stringify(result.brand.signals),
|
||||
result.metadata.provider,
|
||||
result.metadata.confidence,
|
||||
result.metadata.mode,
|
||||
JSON.stringify(result.metadata.signals),
|
||||
dispensaryId,
|
||||
]);
|
||||
}
|
||||
async function moveCategoryToSandbox(dispensaryId, category, reason) {
|
||||
const columnPrefix = category === 'product' ? 'product' :
|
||||
category === 'specials' ? 'specials' :
|
||||
category === 'brand' ? 'brand' : 'metadata';
|
||||
await migrate_1.pool.query(`UPDATE dispensaries SET
|
||||
${columnPrefix}_crawler_mode = 'sandbox',
|
||||
${columnPrefix}_detection_data = ${columnPrefix}_detection_data || $1::jsonb,
|
||||
updated_at = NOW()
|
||||
WHERE id = $2`, [
|
||||
JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
|
||||
dispensaryId,
|
||||
]);
|
||||
logger_1.logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
|
||||
}
|
||||
612
backend/dist/services/menu-provider-detector.js
vendored
Normal file
612
backend/dist/services/menu-provider-detector.js
vendored
Normal file
@@ -0,0 +1,612 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Menu Provider Detection Service
|
||||
*
|
||||
* Detects which menu platform a dispensary is using by analyzing:
|
||||
* - HTML content patterns (scripts, iframes, classes)
|
||||
* - URL patterns (embedded menu paths)
|
||||
* - API endpoint signatures
|
||||
* - Meta tags and headers
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.detectMenuProvider = detectMenuProvider;
|
||||
exports.quickDutchieCheck = quickDutchieCheck;
|
||||
exports.detectProviderChange = detectProviderChange;
|
||||
const puppeteer_1 = __importDefault(require("puppeteer"));
|
||||
const logger_1 = require("./logger");
|
||||
// Provider detection patterns
|
||||
const PROVIDER_PATTERNS = {
|
||||
dutchie: {
|
||||
scripts: [
|
||||
/dutchie/i,
|
||||
/dutchie-plus/i,
|
||||
/dutchie\.com/i,
|
||||
/dutchie-embed/i,
|
||||
],
|
||||
iframes: [
|
||||
/dutchie\.com/i,
|
||||
/embed\.dutchie/i,
|
||||
/iframe\.dutchie/i,
|
||||
],
|
||||
classes: [
|
||||
/dutchie-/i,
|
||||
/DutchieEmbed/i,
|
||||
],
|
||||
urls: [
|
||||
/dutchie\.com/i,
|
||||
/\.dutchie\./i,
|
||||
],
|
||||
meta: [
|
||||
/dutchie/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/graphql.*dutchie/i,
|
||||
/api\.dutchie/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-dutchie/i,
|
||||
/__DUTCHIE__/i,
|
||||
/dutchie-plus-iframe/i,
|
||||
],
|
||||
},
|
||||
treez: {
|
||||
scripts: [
|
||||
/treez/i,
|
||||
/treez\.io/i,
|
||||
/treezpay/i,
|
||||
],
|
||||
iframes: [
|
||||
/treez\.io/i,
|
||||
/menu\.treez/i,
|
||||
],
|
||||
classes: [
|
||||
/treez-/i,
|
||||
],
|
||||
urls: [
|
||||
/treez\.io/i,
|
||||
/\.treez\./i,
|
||||
],
|
||||
meta: [
|
||||
/treez/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.treez/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-treez/i,
|
||||
/treez-embed/i,
|
||||
],
|
||||
},
|
||||
jane: {
|
||||
scripts: [
|
||||
/jane\.co/i,
|
||||
/iheartjane/i,
|
||||
/jane-embed/i,
|
||||
/janetechnologies/i,
|
||||
],
|
||||
iframes: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
/menu\.jane/i,
|
||||
],
|
||||
classes: [
|
||||
/jane-/i,
|
||||
/iheartjane/i,
|
||||
],
|
||||
urls: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/jane/i,
|
||||
/iheartjane/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.iheartjane/i,
|
||||
/api\.jane\.co/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-jane/i,
|
||||
/jane-root/i,
|
||||
/jane-embed/i,
|
||||
],
|
||||
},
|
||||
weedmaps: {
|
||||
scripts: [
|
||||
/weedmaps/i,
|
||||
/wm\.com/i,
|
||||
],
|
||||
iframes: [
|
||||
/weedmaps\.com/i,
|
||||
/menu\.weedmaps/i,
|
||||
],
|
||||
classes: [
|
||||
/weedmaps-/i,
|
||||
/wm-/i,
|
||||
],
|
||||
urls: [
|
||||
/weedmaps\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/weedmaps/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api.*weedmaps/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-weedmaps/i,
|
||||
],
|
||||
},
|
||||
leafly: {
|
||||
scripts: [
|
||||
/leafly/i,
|
||||
/leafly\.com/i,
|
||||
],
|
||||
iframes: [
|
||||
/leafly\.com/i,
|
||||
/menu\.leafly/i,
|
||||
],
|
||||
classes: [
|
||||
/leafly-/i,
|
||||
],
|
||||
urls: [
|
||||
/leafly\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/leafly/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.leafly/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-leafly/i,
|
||||
],
|
||||
},
|
||||
meadow: {
|
||||
scripts: [
|
||||
/meadow/i,
|
||||
/getmeadow/i,
|
||||
],
|
||||
iframes: [
|
||||
/getmeadow\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/meadow-/i,
|
||||
],
|
||||
urls: [
|
||||
/getmeadow\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [
|
||||
/api\.getmeadow/i,
|
||||
],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
greenlight: {
|
||||
scripts: [
|
||||
/greenlight/i,
|
||||
/greenlightmenu/i,
|
||||
],
|
||||
iframes: [
|
||||
/greenlight/i,
|
||||
],
|
||||
classes: [
|
||||
/greenlight-/i,
|
||||
],
|
||||
urls: [
|
||||
/greenlight/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
blaze: {
|
||||
scripts: [
|
||||
/blaze\.me/i,
|
||||
/blazepos/i,
|
||||
],
|
||||
iframes: [
|
||||
/blaze\.me/i,
|
||||
],
|
||||
classes: [
|
||||
/blaze-/i,
|
||||
],
|
||||
urls: [
|
||||
/blaze\.me/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [
|
||||
/api\.blaze/i,
|
||||
],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
flowhub: {
|
||||
scripts: [
|
||||
/flowhub/i,
|
||||
],
|
||||
iframes: [
|
||||
/flowhub\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/flowhub-/i,
|
||||
],
|
||||
urls: [
|
||||
/flowhub\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
dispense: {
|
||||
scripts: [
|
||||
/dispenseapp/i,
|
||||
],
|
||||
iframes: [
|
||||
/dispenseapp\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/dispense-/i,
|
||||
],
|
||||
urls: [
|
||||
/dispenseapp\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
cova: {
|
||||
scripts: [
|
||||
/covasoftware/i,
|
||||
/cova\.software/i,
|
||||
],
|
||||
iframes: [
|
||||
/cova/i,
|
||||
],
|
||||
classes: [
|
||||
/cova-/i,
|
||||
],
|
||||
urls: [
|
||||
/cova/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
};
|
||||
// Common menu URL paths to check
|
||||
const MENU_PATHS = [
|
||||
'/menu',
|
||||
'/shop',
|
||||
'/products',
|
||||
'/order',
|
||||
'/store',
|
||||
'/dispensary-menu',
|
||||
'/online-menu',
|
||||
'/shop-all',
|
||||
'/browse',
|
||||
'/catalog',
|
||||
];
|
||||
/**
|
||||
* Analyze a single page for provider signals
|
||||
*/
|
||||
async function analyzePageForProviders(page, url) {
|
||||
const signals = [];
|
||||
try {
|
||||
// Get page HTML
|
||||
const html = await page.content();
|
||||
const lowerHtml = html.toLowerCase();
|
||||
// Check each provider's patterns
|
||||
for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
|
||||
// Check script sources
|
||||
const scripts = await page.$$eval('script[src]', els => els.map(el => el.getAttribute('src') || ''));
|
||||
for (const script of scripts) {
|
||||
for (const pattern of patterns.scripts) {
|
||||
if (pattern.test(script)) {
|
||||
signals.push({
|
||||
provider: provider,
|
||||
confidence: 90,
|
||||
source: 'script_src',
|
||||
details: script,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check inline scripts
|
||||
const inlineScripts = await page.$$eval('script:not([src])', els => els.map(el => el.textContent || ''));
|
||||
for (const scriptContent of inlineScripts) {
|
||||
for (const pattern of patterns.scripts) {
|
||||
if (pattern.test(scriptContent)) {
|
||||
signals.push({
|
||||
provider: provider,
|
||||
confidence: 70,
|
||||
source: 'inline_script',
|
||||
details: `Pattern: ${pattern}`,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check iframes
|
||||
const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
|
||||
for (const iframe of iframes) {
|
||||
for (const pattern of patterns.iframes) {
|
||||
if (pattern.test(iframe)) {
|
||||
signals.push({
|
||||
provider: provider,
|
||||
confidence: 95,
|
||||
source: 'iframe_src',
|
||||
details: iframe,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check HTML patterns
|
||||
for (const pattern of patterns.htmlPatterns) {
|
||||
if (pattern.test(html)) {
|
||||
signals.push({
|
||||
provider: provider,
|
||||
confidence: 85,
|
||||
source: 'html_pattern',
|
||||
details: `Pattern: ${pattern}`,
|
||||
});
|
||||
}
|
||||
}
|
||||
// Check CSS classes
|
||||
for (const pattern of patterns.classes) {
|
||||
if (pattern.test(html)) {
|
||||
signals.push({
|
||||
provider: provider,
|
||||
confidence: 60,
|
||||
source: 'css_class',
|
||||
details: `Pattern: ${pattern}`,
|
||||
});
|
||||
}
|
||||
}
|
||||
// Check meta tags
|
||||
const metaTags = await page.$$eval('meta', els => els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`));
|
||||
for (const meta of metaTags) {
|
||||
for (const pattern of patterns.meta) {
|
||||
if (pattern.test(meta)) {
|
||||
signals.push({
|
||||
provider: provider,
|
||||
confidence: 80,
|
||||
source: 'meta_tag',
|
||||
details: meta,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check for network requests (if we intercepted them)
|
||||
// This would be enhanced with request interception
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('provider-detection', `Error analyzing page ${url}: ${error}`);
|
||||
}
|
||||
return signals;
|
||||
}
|
||||
/**
|
||||
* Aggregate signals into a final detection result
|
||||
*/
|
||||
function aggregateSignals(signals) {
|
||||
if (signals.length === 0) {
|
||||
return { provider: 'unknown', confidence: 0 };
|
||||
}
|
||||
// Group signals by provider
|
||||
const providerScores = {};
|
||||
for (const signal of signals) {
|
||||
if (!providerScores[signal.provider]) {
|
||||
providerScores[signal.provider] = [];
|
||||
}
|
||||
providerScores[signal.provider].push(signal.confidence);
|
||||
}
|
||||
// Calculate weighted score for each provider
|
||||
const scores = [];
|
||||
for (const [provider, confidences] of Object.entries(providerScores)) {
|
||||
// Use max confidence + bonus for multiple signals
|
||||
const maxConf = Math.max(...confidences);
|
||||
const multiSignalBonus = Math.min(10, (confidences.length - 1) * 3);
|
||||
const score = Math.min(100, maxConf + multiSignalBonus);
|
||||
scores.push({ provider: provider, score });
|
||||
}
|
||||
// Sort by score descending
|
||||
scores.sort((a, b) => b.score - a.score);
|
||||
const best = scores[0];
|
||||
// If there's a clear winner (20+ point lead), use it
|
||||
if (scores.length === 1 || best.score - scores[1].score >= 20) {
|
||||
return { provider: best.provider, confidence: best.score };
|
||||
}
|
||||
// Multiple contenders - reduce confidence
|
||||
return { provider: best.provider, confidence: Math.max(50, best.score - 20) };
|
||||
}
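A quick worked example of the scoring, assuming two Dutchie signals (iframe_src at 95, inline_script at 70) and one signal of 60 from a hypothetical second provider:

// Sketch: how aggregateSignals scores the example above.
// dutchie: max(95, 70) = 95, bonus = min(10, (2 - 1) * 3) = 3  -> score 98
// other:   max(60)     = 60, bonus = 0                         -> score 60
// 98 - 60 >= 20, so the result is { provider: 'dutchie', confidence: 98 }.
// Had the lead been under 20 points, confidence would drop to max(50, 98 - 20) = 78.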
|
||||
/**
|
||||
* Detect the menu provider for a dispensary
|
||||
*/
|
||||
async function detectMenuProvider(websiteUrl, options = {}) {
|
||||
const { checkMenuPaths = true, timeout = 30000 } = options;
|
||||
const result = {
|
||||
provider: 'unknown',
|
||||
confidence: 0,
|
||||
signals: [],
|
||||
urlsTested: [],
|
||||
menuEntryPoints: [],
|
||||
rawSignals: {},
|
||||
};
|
||||
let browser = null;
|
||||
try {
|
||||
// Normalize URL
|
||||
let baseUrl = websiteUrl.trim();
|
||||
if (!baseUrl.startsWith('http')) {
|
||||
baseUrl = `https://${baseUrl}`;
|
||||
}
|
||||
baseUrl = baseUrl.replace(/\/$/, ''); // Remove trailing slash
|
||||
// Launch browser
|
||||
browser = await puppeteer_1.default.launch({
|
||||
headless: true,
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
],
|
||||
});
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
||||
// Track network requests for API detection
|
||||
const apiRequests = [];
|
||||
await page.setRequestInterception(true);
|
||||
page.on('request', (request) => {
|
||||
const url = request.url();
|
||||
if (url.includes('api') || url.includes('graphql')) {
|
||||
apiRequests.push(url);
|
||||
}
|
||||
request.continue();
|
||||
});
|
||||
// URLs to check
|
||||
const urlsToCheck = [baseUrl];
|
||||
if (checkMenuPaths) {
|
||||
for (const path of MENU_PATHS) {
|
||||
urlsToCheck.push(`${baseUrl}${path}`);
|
||||
}
|
||||
}
|
||||
// Check each URL
|
||||
for (const url of urlsToCheck) {
|
||||
try {
|
||||
result.urlsTested.push(url);
|
||||
await page.goto(url, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout,
|
||||
});
|
||||
// Wait a bit for dynamic content
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
// Analyze page
|
||||
const pageSignals = await analyzePageForProviders(page, url);
|
||||
result.signals.push(...pageSignals);
|
||||
// Track if this URL has menu content
|
||||
const hasMenuContent = await page.evaluate(() => {
|
||||
const text = document.body.innerText.toLowerCase();
|
||||
return (text.includes('add to cart') ||
|
||||
text.includes('add to bag') ||
|
||||
text.includes('product') ||
|
||||
text.includes('indica') ||
|
||||
text.includes('sativa') ||
|
||||
text.includes('hybrid') ||
|
||||
text.includes('thc') ||
|
||||
text.includes('cbd'));
|
||||
});
|
||||
if (hasMenuContent && url !== baseUrl) {
|
||||
result.menuEntryPoints.push(url);
|
||||
}
|
||||
}
|
||||
catch (pageError) {
|
||||
// 404s are fine, just skip
|
||||
if (!pageError.message?.includes('404')) {
|
||||
logger_1.logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check API requests for provider hints
|
||||
for (const apiUrl of apiRequests) {
|
||||
for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
|
||||
for (const pattern of patterns.apiEndpoints) {
|
||||
if (pattern.test(apiUrl)) {
|
||||
result.signals.push({
|
||||
provider: provider,
|
||||
confidence: 95,
|
||||
source: 'api_request',
|
||||
details: apiUrl,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Record raw signals
|
||||
result.rawSignals = {
|
||||
apiRequestsFound: apiRequests.length,
|
||||
menuEntryPointsFound: result.menuEntryPoints.length,
|
||||
totalSignals: result.signals.length,
|
||||
uniqueProviders: [...new Set(result.signals.map(s => s.provider))].length,
|
||||
};
|
||||
// Aggregate signals into final result
|
||||
const aggregated = aggregateSignals(result.signals);
|
||||
result.provider = aggregated.provider;
|
||||
result.confidence = aggregated.confidence;
|
||||
}
|
||||
catch (error) {
|
||||
result.error = error.message;
|
||||
logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
|
||||
}
|
||||
finally {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
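A minimal usage sketch (the dispensary URL is made up):

// Sketch: run detection against a single site and read the result fields.
async function exampleDetect() {
    const result = await detectMenuProvider('https://example-dispensary.com', {
        checkMenuPaths: true,
        timeout: 30000,
    });
    console.log(result.provider, result.confidence); // e.g. 'dutchie', 95
    console.log(result.menuEntryPoints);             // URLs that looked like live menus
    console.log(result.urlsTested.length, 'URLs tested');
}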
|
||||
/**
|
||||
* Quick check if a site has Dutchie - used during production crawls
|
||||
*/
|
||||
async function quickDutchieCheck(page) {
|
||||
try {
|
||||
const html = await page.content();
|
||||
// Check for Dutchie-specific patterns
|
||||
const dutchiePatterns = [
|
||||
/dutchie/i,
|
||||
/dutchie-plus/i,
|
||||
/__DUTCHIE__/i,
|
||||
/data-dutchie/i,
|
||||
/embed\.dutchie/i,
|
||||
];
|
||||
for (const pattern of dutchiePatterns) {
|
||||
if (pattern.test(html)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Check iframes
|
||||
const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
|
||||
for (const iframe of iframes) {
|
||||
if (/dutchie/i.test(iframe)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Check if provider has changed from expected
|
||||
*/
|
||||
async function detectProviderChange(page, expectedProvider) {
|
||||
try {
|
||||
const signals = await analyzePageForProviders(page, page.url());
|
||||
const aggregated = aggregateSignals(signals);
|
||||
// If we expected Dutchie but found something else with high confidence
|
||||
if (expectedProvider === 'dutchie' && aggregated.provider !== 'dutchie' && aggregated.confidence >= 70) {
|
||||
return {
|
||||
changed: true,
|
||||
newProvider: aggregated.provider,
|
||||
confidence: aggregated.confidence,
|
||||
};
|
||||
}
|
||||
// If we expected Dutchie and found nothing/low confidence, might have switched
|
||||
if (expectedProvider === 'dutchie' && aggregated.confidence < 30) {
|
||||
// Check if Dutchie is definitely NOT present
|
||||
const hasDutchie = await quickDutchieCheck(page);
|
||||
if (!hasDutchie) {
|
||||
return {
|
||||
changed: true,
|
||||
newProvider: aggregated.provider !== 'unknown' ? aggregated.provider : 'other',
|
||||
confidence: Math.max(30, aggregated.confidence),
|
||||
};
|
||||
}
|
||||
}
|
||||
return { changed: false };
|
||||
}
|
||||
catch {
|
||||
return { changed: false };
|
||||
}
|
||||
}
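A sketch of how a crawler might consume this during a scheduled run (the follow-up helper is hypothetical):

// Sketch: after navigating to a store's menu page during a crawl.
async function exampleProviderChangeCheck(page) {
    const change = await detectProviderChange(page, 'dutchie');
    if (change.changed) {
        // A real caller would likely queue the store for a full
        // detectMenuProvider() pass before the next crawl (hypothetical step).
        console.warn(`Provider appears to have changed to ${change.newProvider} ` +
            `(confidence ${change.confidence})`);
    }
}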
|
||||
171
backend/dist/services/proxy.js
vendored

@@ -3,22 +3,92 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.isBotDetectionError = isBotDetectionError;
|
||||
exports.putProxyInTimeout = putProxyInTimeout;
|
||||
exports.isProxyInTimeout = isProxyInTimeout;
|
||||
exports.getActiveProxy = getActiveProxy;
|
||||
exports.testProxy = testProxy;
|
||||
exports.saveProxyTestResult = saveProxyTestResult;
|
||||
exports.testAllProxies = testAllProxies;
|
||||
exports.addProxy = addProxy;
|
||||
exports.addProxiesFromList = addProxiesFromList;
|
||||
exports.moveProxyToFailed = moveProxyToFailed;
|
||||
exports.incrementProxyFailure = incrementProxyFailure;
|
||||
const axios_1 = __importDefault(require("axios"));
|
||||
const socks_proxy_agent_1 = require("socks-proxy-agent");
|
||||
const https_proxy_agent_1 = require("https-proxy-agent");
|
||||
const migrate_1 = require("../db/migrate");
|
||||
// In-memory proxy timeout tracking
|
||||
// Maps proxy ID to timestamp when timeout expires
|
||||
const proxyTimeouts = new Map();
|
||||
const PROXY_TIMEOUT_MS = 35000; // 35 seconds timeout for bot-detected proxies
|
||||
// Check if error message indicates bot detection
|
||||
function isBotDetectionError(errorMsg) {
|
||||
const botPatterns = [
|
||||
/bot detection/i,
|
||||
/captcha/i,
|
||||
/challenge/i,
|
||||
/cloudflare/i,
|
||||
/access denied/i,
|
||||
/rate limit/i,
|
||||
/too many requests/i,
|
||||
/temporarily blocked/i,
|
||||
/suspicious activity/i,
|
||||
];
|
||||
return botPatterns.some(pattern => pattern.test(errorMsg));
|
||||
}
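For reference, a few example classifications against the patterns above (sketch):

// Sketch: strings the patterns above would classify as bot detection.
isBotDetectionError('403 Forbidden: Cloudflare challenge page'); // true  (cloudflare / challenge)
isBotDetectionError('429 Too Many Requests');                    // true  (too many requests)
isBotDetectionError('net::ERR_TUNNEL_CONNECTION_FAILED');        // false (ordinary proxy failure)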
|
||||
// Put proxy in timeout (bot detection cooldown)
|
||||
function putProxyInTimeout(proxyId, reason) {
|
||||
const timeoutUntil = Date.now() + PROXY_TIMEOUT_MS;
|
||||
proxyTimeouts.set(proxyId, timeoutUntil);
|
||||
console.log(`🚫 Proxy ${proxyId} in timeout for ${PROXY_TIMEOUT_MS / 1000}s: ${reason}`);
|
||||
}
|
||||
// Check if proxy is currently in timeout
|
||||
function isProxyInTimeout(proxyId) {
|
||||
const timeoutUntil = proxyTimeouts.get(proxyId);
|
||||
if (!timeoutUntil)
|
||||
return false;
|
||||
if (Date.now() >= timeoutUntil) {
|
||||
// Timeout expired, remove it
|
||||
proxyTimeouts.delete(proxyId);
|
||||
console.log(`✅ Proxy ${proxyId} timeout expired, back in rotation`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// Get active proxy that's not in timeout
|
||||
async function getActiveProxy() {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT id, host, port, protocol, username, password
|
||||
FROM proxies
|
||||
WHERE active = true
|
||||
ORDER BY RANDOM()
|
||||
`);
|
||||
// Filter out proxies in timeout
|
||||
for (const proxy of result.rows) {
|
||||
if (!isProxyInTimeout(proxy.id)) {
|
||||
return proxy;
|
||||
}
|
||||
}
|
||||
// All proxies are in timeout, wait for first one to expire
|
||||
if (proxyTimeouts.size > 0) {
|
||||
const nextAvailable = Math.min(...Array.from(proxyTimeouts.values()));
|
||||
const waitTime = Math.max(0, nextAvailable - Date.now());
|
||||
console.log(`⏳ All proxies in timeout, waiting ${Math.ceil(waitTime / 1000)}s for next available...`);
|
||||
await new Promise(resolve => setTimeout(resolve, waitTime));
|
||||
// Try again after waiting
|
||||
return getActiveProxy();
|
||||
}
|
||||
console.log('⚠️ No active proxies available');
|
||||
return null;
|
||||
}
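Putting the pieces together, a typical consumer looks roughly like this (hedged sketch; doScrape() is a placeholder for the real work):

// Sketch: rotate proxies with the bot-detection cooldown.
async function exampleFetchWithProxy() {
    const proxy = await getActiveProxy(); // waits if every proxy is cooling down
    if (!proxy) return null;
    try {
        return await doScrape(proxy); // placeholder for the actual request/scrape
    }
    catch (err) {
        const msg = String(err);
        if (isBotDetectionError(msg)) {
            putProxyInTimeout(proxy.id, msg);           // 35s cooldown, proxy stays active
        }
        else {
            await incrementProxyFailure(proxy.id, msg); // 3 strikes -> failed_proxies
        }
        throw err;
    }
}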
|
||||
async function getSettings() {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT key, value FROM settings
|
||||
WHERE key IN ('proxy_timeout_ms', 'proxy_test_url')
|
||||
`);
|
||||
const settings = {};
|
||||
result.rows.forEach(row => {
|
||||
result.rows.forEach((row) => {
|
||||
settings[row.key] = row.value;
|
||||
});
|
||||
return {
|
||||
@@ -146,12 +216,44 @@ async function addProxy(host, port, protocol, username, password) {
|
||||
async function addProxiesFromList(proxies) {
|
||||
let added = 0;
|
||||
let failed = 0;
|
||||
let duplicates = 0;
|
||||
const errors = [];
|
||||
console.log(`📥 Importing ${proxies.length} proxies without testing...`);
|
||||
for (const proxy of proxies) {
|
||||
try {
|
||||
await addProxy(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
|
||||
added++;
|
||||
console.log(`✅ Added proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
|
||||
// Insert without testing first
|
||||
await migrate_1.pool.query(`
|
||||
INSERT INTO proxies (host, port, protocol, username, password, active)
|
||||
VALUES ($1, $2, $3, $4, $5, false)
|
||||
ON CONFLICT (host, port, protocol) DO NOTHING
|
||||
`, [
|
||||
proxy.host,
|
||||
proxy.port,
|
||||
proxy.protocol,
|
||||
proxy.username,
|
||||
proxy.password
|
||||
]);
|
||||
// Check if it was actually inserted
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT id FROM proxies
|
||||
WHERE host = $1 AND port = $2 AND protocol = $3
|
||||
`, [proxy.host, proxy.port, proxy.protocol]);
|
||||
if (result.rows.length > 0) {
|
||||
// Check if it was just inserted (no last_tested_at means new)
|
||||
const checkResult = await migrate_1.pool.query(`
|
||||
SELECT last_tested_at FROM proxies
|
||||
WHERE host = $1 AND port = $2 AND protocol = $3
|
||||
`, [proxy.host, proxy.port, proxy.protocol]);
|
||||
if (checkResult.rows[0].last_tested_at === null) {
|
||||
added++;
|
||||
if (added % 100 === 0) {
|
||||
console.log(`📥 Imported ${added} proxies...`);
|
||||
}
|
||||
}
|
||||
else {
|
||||
duplicates++;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
failed++;
|
||||
@@ -159,8 +261,63 @@ async function addProxiesFromList(proxies) {
|
||||
errors.push(errorMsg);
|
||||
console.log(`❌ Failed to add proxy: ${errorMsg}`);
|
||||
}
|
||||
// Small delay between adds
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
}
|
||||
return { added, failed, errors };
|
||||
console.log(`✅ Import complete: ${added} added, ${duplicates} duplicates, ${failed} failed`);
|
||||
return { added, failed, duplicates, errors };
|
||||
}
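Callers typically feed this from a plain proxy list; a small parsing sketch (the host:port:user:pass format and default protocol are assumptions):

// Sketch: parse "host:port:user:pass" lines into the shape addProxiesFromList expects.
function parseProxyLines(text, protocol = 'socks5') {
    return text
        .split('\n')
        .map(line => line.trim())
        .filter(Boolean)
        .map(line => {
            const [host, port, username, password] = line.split(':');
            return { host, port: parseInt(port, 10), protocol, username, password };
        });
}
// const summary = await addProxiesFromList(parseProxyLines(fileContents));
// summary -> { added, failed, duplicates, errors }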
|
||||
async function moveProxyToFailed(proxyId, errorMsg) {
|
||||
// Get proxy details
|
||||
const proxyResult = await migrate_1.pool.query(`
|
||||
SELECT host, port, protocol, username, password, failure_count
|
||||
FROM proxies
|
||||
WHERE id = $1
|
||||
`, [proxyId]);
|
||||
if (proxyResult.rows.length === 0) {
|
||||
return;
|
||||
}
|
||||
const proxy = proxyResult.rows[0];
|
||||
// Insert into failed_proxies table
|
||||
await migrate_1.pool.query(`
|
||||
INSERT INTO failed_proxies (host, port, protocol, username, password, failure_count, last_error)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
ON CONFLICT (host, port, protocol)
|
||||
DO UPDATE SET
|
||||
failure_count = $6,
|
||||
last_error = $7,
|
||||
failed_at = CURRENT_TIMESTAMP
|
||||
`, [
|
||||
proxy.host,
|
||||
proxy.port,
|
||||
proxy.protocol,
|
||||
proxy.username,
|
||||
proxy.password,
|
||||
proxy.failure_count,
|
||||
errorMsg
|
||||
]);
|
||||
// Delete from active proxies
|
||||
await migrate_1.pool.query(`DELETE FROM proxies WHERE id = $1`, [proxyId]);
|
||||
console.log(`🔴 Moved proxy to failed: ${proxy.protocol}://${proxy.host}:${proxy.port} (${proxy.failure_count} failures)`);
|
||||
}
|
||||
async function incrementProxyFailure(proxyId, errorMsg) {
|
||||
// Increment failure count
|
||||
const result = await migrate_1.pool.query(`
|
||||
UPDATE proxies
|
||||
SET failure_count = failure_count + 1,
|
||||
active = false,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $1
|
||||
RETURNING failure_count, host, port, protocol
|
||||
`, [proxyId]);
|
||||
if (result.rows.length === 0) {
|
||||
return false;
|
||||
}
|
||||
const proxy = result.rows[0];
|
||||
const failureCount = proxy.failure_count;
|
||||
console.log(`⚠️ Proxy failure #${failureCount}: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
|
||||
// If failed 3 times, move to failed table
|
||||
if (failureCount >= 3) {
|
||||
await moveProxyToFailed(proxyId, errorMsg);
|
||||
return true; // Moved to failed
|
||||
}
|
||||
return false; // Still in active proxies
|
||||
}
|
||||
|
||||
174
backend/dist/services/proxyTestQueue.js
vendored
Normal file
@@ -0,0 +1,174 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.cleanupOrphanedJobs = cleanupOrphanedJobs;
|
||||
exports.createProxyTestJob = createProxyTestJob;
|
||||
exports.getProxyTestJob = getProxyTestJob;
|
||||
exports.getActiveProxyTestJob = getActiveProxyTestJob;
|
||||
exports.cancelProxyTestJob = cancelProxyTestJob;
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const proxy_1 = require("./proxy");
|
||||
// Simple in-memory queue - could be replaced with Bull/Bee-Queue for production
|
||||
const activeJobs = new Map();
|
||||
// Clean up orphaned jobs on server startup
|
||||
async function cleanupOrphanedJobs() {
|
||||
try {
|
||||
const result = await migrate_1.pool.query(`
|
||||
UPDATE proxy_test_jobs
|
||||
SET status = 'cancelled',
|
||||
completed_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE status IN ('pending', 'running')
|
||||
RETURNING id
|
||||
`);
|
||||
if (result.rows.length > 0) {
|
||||
console.log(`🧹 Cleaned up ${result.rows.length} orphaned proxy test jobs`);
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
console.error('Error cleaning up orphaned jobs:', error);
|
||||
}
|
||||
}
|
||||
async function createProxyTestJob() {
|
||||
// Check for existing running jobs first
|
||||
const existingJob = await getActiveProxyTestJob();
|
||||
if (existingJob) {
|
||||
throw new Error('A proxy test job is already running. Please cancel it first.');
|
||||
}
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT COUNT(*) as count FROM proxies
|
||||
`);
|
||||
const totalProxies = parseInt(result.rows[0].count);
|
||||
const jobResult = await migrate_1.pool.query(`
|
||||
INSERT INTO proxy_test_jobs (status, total_proxies)
|
||||
VALUES ('pending', $1)
|
||||
RETURNING id
|
||||
`, [totalProxies]);
|
||||
const jobId = jobResult.rows[0].id;
|
||||
// Start job in background
|
||||
runProxyTestJob(jobId).catch(err => {
|
||||
console.error(`❌ Proxy test job ${jobId} failed:`, err);
|
||||
});
|
||||
return jobId;
|
||||
}
|
||||
async function getProxyTestJob(jobId) {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies
|
||||
FROM proxy_test_jobs
|
||||
WHERE id = $1
|
||||
`, [jobId]);
|
||||
if (result.rows.length === 0) {
|
||||
return null;
|
||||
}
|
||||
return result.rows[0];
|
||||
}
|
||||
async function getActiveProxyTestJob() {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies
|
||||
FROM proxy_test_jobs
|
||||
WHERE status IN ('pending', 'running')
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 1
|
||||
`);
|
||||
if (result.rows.length === 0) {
|
||||
return null;
|
||||
}
|
||||
return result.rows[0];
|
||||
}
|
||||
async function cancelProxyTestJob(jobId) {
|
||||
// Try to cancel in-memory job first
|
||||
const jobControl = activeJobs.get(jobId);
|
||||
if (jobControl) {
|
||||
jobControl.cancelled = true;
|
||||
}
|
||||
// Always update database to handle orphaned jobs
|
||||
const result = await migrate_1.pool.query(`
|
||||
UPDATE proxy_test_jobs
|
||||
SET status = 'cancelled',
|
||||
completed_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $1 AND status IN ('pending', 'running')
|
||||
RETURNING id
|
||||
`, [jobId]);
|
||||
return result.rows.length > 0;
|
||||
}
|
||||
async function runProxyTestJob(jobId) {
|
||||
// Register job as active
|
||||
activeJobs.set(jobId, { cancelled: false });
|
||||
try {
|
||||
// Update status to running
|
||||
await migrate_1.pool.query(`
|
||||
UPDATE proxy_test_jobs
|
||||
SET status = 'running',
|
||||
started_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $1
|
||||
`, [jobId]);
|
||||
console.log(`🔍 Starting proxy test job ${jobId}...`);
|
||||
// Get all proxies
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT id, host, port, protocol, username, password
|
||||
FROM proxies
|
||||
ORDER BY id
|
||||
`);
|
||||
let tested = 0;
|
||||
let passed = 0;
|
||||
let failed = 0;
|
||||
for (const proxy of result.rows) {
|
||||
// Check if job was cancelled
|
||||
const jobControl = activeJobs.get(jobId);
|
||||
if (jobControl?.cancelled) {
|
||||
console.log(`⏸️ Proxy test job ${jobId} cancelled`);
|
||||
break;
|
||||
}
|
||||
// Test the proxy
|
||||
const testResult = await (0, proxy_1.testProxy)(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
|
||||
// Save result
|
||||
await (0, proxy_1.saveProxyTestResult)(proxy.id, testResult);
|
||||
tested++;
|
||||
if (testResult.success) {
|
||||
passed++;
|
||||
}
|
||||
else {
|
||||
failed++;
|
||||
}
|
||||
// Update job progress
|
||||
await migrate_1.pool.query(`
|
||||
UPDATE proxy_test_jobs
|
||||
SET tested_proxies = $1,
|
||||
passed_proxies = $2,
|
||||
failed_proxies = $3,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $4
|
||||
`, [tested, passed, failed, jobId]);
|
||||
// Log progress every 10 proxies
|
||||
if (tested % 10 === 0) {
|
||||
console.log(`📊 Job ${jobId}: ${tested}/${result.rows.length} proxies tested (${passed} passed, ${failed} failed)`);
|
||||
}
|
||||
}
|
||||
// Mark job as completed
|
||||
const jobControl = activeJobs.get(jobId);
|
||||
const finalStatus = jobControl?.cancelled ? 'cancelled' : 'completed';
|
||||
await migrate_1.pool.query(`
|
||||
UPDATE proxy_test_jobs
|
||||
SET status = $1,
|
||||
completed_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $2
|
||||
`, [finalStatus, jobId]);
|
||||
console.log(`✅ Proxy test job ${jobId} ${finalStatus}: ${tested} tested, ${passed} passed, ${failed} failed`);
|
||||
}
|
||||
catch (error) {
|
||||
console.error(`❌ Proxy test job ${jobId} error:`, error);
|
||||
await migrate_1.pool.query(`
|
||||
UPDATE proxy_test_jobs
|
||||
SET status = 'failed',
|
||||
completed_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $1
|
||||
`, [jobId]);
|
||||
}
|
||||
finally {
|
||||
// Remove from active jobs
|
||||
activeJobs.delete(jobId);
|
||||
}
|
||||
}
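From a route handler the queue is consumed roughly like this (sketch; the actual route wiring is not part of this diff):

// Sketch: start a test run, poll its progress, and read the final status.
async function exampleRunProxyTests() {
    const jobId = await createProxyTestJob(); // throws if a job is already running
    let job = await getProxyTestJob(jobId);
    while (job && (job.status === 'pending' || job.status === 'running')) {
        await new Promise(r => setTimeout(r, 2000));
        job = await getProxyTestJob(jobId);
        console.log(`${job.tested_proxies}/${job.total_proxies} tested`);
    }
    return job; // status: 'completed' | 'cancelled' | 'failed'
}
// cancelProxyTestJob(jobId) can be called at any point to stop the run.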
|
||||
2
backend/dist/services/scheduler.js
vendored
@@ -18,7 +18,7 @@ async function getSettings() {
|
||||
WHERE key IN ('scrape_interval_hours', 'scrape_specials_time')
|
||||
`);
|
||||
const settings = {};
|
||||
result.rows.forEach(row => {
|
||||
result.rows.forEach((row) => {
|
||||
settings[row.key] = row.value;
|
||||
});
|
||||
return {
|
||||
|
||||
7
backend/dist/services/scraper-debug.js
vendored
@@ -4,10 +4,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.debugDutchiePage = debugDutchiePage;
|
||||
const puppeteer_1 = __importDefault(require("puppeteer"));
|
||||
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
||||
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
||||
const logger_1 = require("./logger");
|
||||
// Apply stealth plugin
|
||||
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
||||
async function debugDutchiePage(url) {
|
||||
const browser = await puppeteer_1.default.launch({
|
||||
const browser = await puppeteer_extra_1.default.launch({
|
||||
headless: 'new',
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
|
||||
});
|
||||
|
||||
236
backend/dist/services/scraper-playwright.js
vendored
Normal file
@@ -0,0 +1,236 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.scrapeCategoryPlaywright = scrapeCategoryPlaywright;
|
||||
exports.testScrapeCategoryPlaywright = testScrapeCategoryPlaywright;
|
||||
const age_gate_playwright_1 = require("../utils/age-gate-playwright");
|
||||
const logger_1 = require("./logger");
|
||||
const stealthBrowser_1 = require("../utils/stealthBrowser");
|
||||
const dutchie_1 = require("../scrapers/templates/dutchie");
|
||||
/**
|
||||
* Scrapes a category page using Playwright with stealth mode to extract product information
|
||||
*/
|
||||
async function scrapeCategoryPlaywright(categoryUrl, categoryName, state = 'Arizona', proxy) {
|
||||
logger_1.logger.info('scraper', `Scraping category: ${categoryName}`);
|
||||
logger_1.logger.info('scraper', `URL: ${categoryUrl}`);
|
||||
// Create stealth browser with optional proxy
|
||||
const browser = await (0, stealthBrowser_1.createStealthBrowser)({ proxy, headless: true });
|
||||
try {
|
||||
// Create stealth context with age gate cookies
|
||||
const context = await (0, stealthBrowser_1.createStealthContext)(browser, { state });
|
||||
// Try to load saved session cookies
|
||||
const cookiesPath = `/tmp/dutchie-session-${state.toLowerCase()}.json`;
|
||||
await (0, stealthBrowser_1.loadCookies)(context, cookiesPath);
|
||||
const page = await context.newPage();
|
||||
// Navigate to category page
|
||||
logger_1.logger.info('scraper', `Loading page: ${categoryUrl}`);
|
||||
await page.goto(categoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
|
||||
// Random delay to appear more human
|
||||
await (0, stealthBrowser_1.randomDelay)(1000, 2000);
|
||||
// Check for Cloudflare challenge
|
||||
if (await (0, stealthBrowser_1.isCloudflareChallenge)(page)) {
|
||||
logger_1.logger.info('scraper', '🛡️ Cloudflare challenge detected, waiting...');
|
||||
const passed = await (0, stealthBrowser_1.waitForCloudflareChallenge)(page, 30000);
|
||||
if (!passed) {
|
||||
logger_1.logger.error('scraper', '❌ Failed to pass Cloudflare challenge');
|
||||
await browser.close();
|
||||
return [];
|
||||
}
|
||||
// Save successful session cookies
|
||||
await (0, stealthBrowser_1.saveCookies)(context, cookiesPath);
|
||||
}
|
||||
// Wait for page to be fully loaded
|
||||
await (0, stealthBrowser_1.waitForPageLoad)(page);
|
||||
// Simulate human behavior
|
||||
await (0, stealthBrowser_1.simulateHumanBehavior)(page);
|
||||
// Check for and bypass age gate
|
||||
const bypassed = await (0, age_gate_playwright_1.bypassAgeGatePlaywright)(page, state);
|
||||
if (!bypassed) {
|
||||
logger_1.logger.error('scraper', 'Failed to bypass age gate');
|
||||
await browser.close();
|
||||
return [];
|
||||
}
|
||||
// Wait for products to load with random delay
|
||||
logger_1.logger.info('scraper', 'Waiting for products to load...');
|
||||
await (0, stealthBrowser_1.randomDelay)(2000, 4000);
|
||||
// Scroll to load all products with human-like behavior
|
||||
logger_1.logger.info('scraper', 'Scrolling to load all products...');
|
||||
await scrollToBottomHuman(page);
|
||||
// Extract products
|
||||
logger_1.logger.info('scraper', 'Extracting products from page...');
|
||||
const products = await extractProducts(page, categoryUrl, categoryName);
|
||||
logger_1.logger.info('scraper', `Found ${products.length} products`);
|
||||
await browser.close();
|
||||
return products;
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('scraper', `Error scraping category: ${error}`);
|
||||
await browser.close();
|
||||
return [];
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Scrolls to the bottom of the page with human-like behavior
|
||||
*/
|
||||
async function scrollToBottomHuman(page) {
|
||||
let previousHeight = 0;
|
||||
let currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||
let attempts = 0;
|
||||
const maxAttempts = 20;
|
||||
while (previousHeight < currentHeight && attempts < maxAttempts) {
|
||||
previousHeight = currentHeight;
|
||||
// Scroll down in chunks with randomized delays
|
||||
const scrollAmount = Math.floor(Math.random() * 200) + 300; // 300-500px
|
||||
await (0, stealthBrowser_1.humanScroll)(page, scrollAmount);
|
||||
// Random pause like a human reading
|
||||
await (0, stealthBrowser_1.randomDelay)(500, 1500);
|
||||
// Check new height
|
||||
currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||
attempts++;
|
||||
}
|
||||
// Final wait for any lazy-loaded content
|
||||
await (0, stealthBrowser_1.randomDelay)(1000, 2000);
|
||||
}
|
||||
/**
|
||||
* Extracts product information from the page
|
||||
*/
|
||||
async function extractProducts(page, categoryUrl, categoryName) {
|
||||
let products = [];
|
||||
// Check if we have a template for this URL
|
||||
const template = (0, dutchie_1.getTemplateForUrl)(categoryUrl);
|
||||
if (template) {
|
||||
logger_1.logger.info('scraper', `Using ${template.name} template for extraction`);
|
||||
try {
|
||||
const templateProducts = await template.extractProducts(page);
|
||||
// Add category to products from template
|
||||
products = templateProducts.map(p => ({
|
||||
...p,
|
||||
category: categoryName,
|
||||
}));
|
||||
logger_1.logger.info('scraper', `Template extracted ${products.length} products`);
|
||||
return products;
|
||||
}
|
||||
catch (err) {
|
||||
logger_1.logger.error('scraper', `Template extraction failed: ${err}`);
|
||||
// Fall through to fallback methods
|
||||
}
|
||||
}
|
||||
// Fallback Method 1: Dutchie products (for Sol Flower, etc.)
|
||||
try {
|
||||
const dutchieProducts = await page.locator('[data-testid^="product-"], .product-card, [class*="ProductCard"]').all();
|
||||
if (dutchieProducts.length > 0) {
|
||||
logger_1.logger.info('scraper', `Found ${dutchieProducts.length} Dutchie-style products`);
|
||||
for (const productEl of dutchieProducts) {
|
||||
try {
|
||||
const name = await productEl.locator('[data-testid="product-name"], .product-name, h3, h4').first().textContent() || '';
|
||||
const brand = await productEl.locator('[data-testid="product-brand"], .product-brand, .brand').first().textContent().catch(() => '');
|
||||
const priceText = await productEl.locator('[data-testid="product-price"], .product-price, .price').first().textContent().catch(() => '');
|
||||
const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
|
||||
const productLink = await productEl.locator('a').first().getAttribute('href').catch(() => '');
|
||||
// Parse price
|
||||
const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
|
||||
if (name) {
|
||||
products.push({
|
||||
name: name.trim(),
|
||||
brand: brand ? brand.trim() : undefined,
|
||||
category: categoryName,
|
||||
price,
|
||||
image_url: imageUrl || undefined,
|
||||
product_url: productLink ? new URL(productLink, categoryUrl).toString() : categoryUrl,
|
||||
in_stock: true
|
||||
});
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
logger_1.logger.warn('scraper', `Error extracting Dutchie product: ${err}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
logger_1.logger.warn('scraper', `Dutchie product extraction failed: ${err}`);
|
||||
}
|
||||
// Method 2: Curaleaf products
|
||||
if (products.length === 0) {
|
||||
try {
|
||||
const curaleafProducts = await page.locator('.product, [class*="Product"], [class*="item"]').all();
|
||||
if (curaleafProducts.length > 0) {
|
||||
logger_1.logger.info('scraper', `Found ${curaleafProducts.length} Curaleaf-style products`);
|
||||
for (const productEl of curaleafProducts) {
|
||||
try {
|
||||
const name = await productEl.locator('h1, h2, h3, h4, .title, .name').first().textContent() || '';
|
||||
const priceText = await productEl.locator('.price, [class*="price"]').first().textContent().catch(() => '');
|
||||
const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
|
||||
const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
|
||||
if (name && name.length > 3) {
|
||||
products.push({
|
||||
name: name.trim(),
|
||||
category: categoryName,
|
||||
price,
|
||||
image_url: imageUrl || undefined,
|
||||
product_url: categoryUrl,
|
||||
in_stock: true
|
||||
});
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
logger_1.logger.warn('scraper', `Error extracting Curaleaf product: ${err}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
logger_1.logger.warn('scraper', `Curaleaf product extraction failed: ${err}`);
|
||||
}
|
||||
}
|
||||
// Method 3: Generic product cards
|
||||
if (products.length === 0) {
|
||||
try {
|
||||
const genericProducts = await page.locator('article, [role="article"], .card, [class*="card"]').all();
|
||||
logger_1.logger.info('scraper', `Trying generic selectors, found ${genericProducts.length} elements`);
|
||||
for (const productEl of genericProducts) {
|
||||
try {
|
||||
const text = await productEl.textContent() || '';
|
||||
// Only consider elements that look like products
|
||||
if (text.includes('$') || text.toLowerCase().includes('price') || text.toLowerCase().includes('thc')) {
|
||||
const name = await productEl.locator('h1, h2, h3, h4').first().textContent() || '';
|
||||
if (name && name.length > 3) {
|
||||
products.push({
|
||||
name: name.trim(),
|
||||
category: categoryName,
|
||||
product_url: categoryUrl,
|
||||
in_stock: true
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
// Skip this element
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
logger_1.logger.warn('scraper', `Generic product extraction failed: ${err}`);
|
||||
}
|
||||
}
|
||||
return products;
|
||||
}
|
||||
/**
|
||||
* Test function to scrape a single category
|
||||
*/
|
||||
async function testScrapeCategoryPlaywright(url, categoryName, state = 'Arizona') {
|
||||
console.log(`\n🎭 Testing Playwright Category Scraper\n`);
|
||||
console.log(`Category: ${categoryName}`);
|
||||
console.log(`URL: ${url}\n`);
|
||||
const products = await scrapeCategoryPlaywright(url, categoryName, state);
|
||||
console.log(`\n✅ Found ${products.length} products\n`);
|
||||
products.slice(0, 5).forEach((p, i) => {
|
||||
console.log(`${i + 1}. ${p.name}`);
|
||||
if (p.brand)
|
||||
console.log(` Brand: ${p.brand}`);
|
||||
if (p.price)
|
||||
console.log(` Price: $${p.price}`);
|
||||
console.log(` URL: ${p.product_url}`);
|
||||
console.log('');
|
||||
});
|
||||
return products;
|
||||
}
|
||||
258
backend/dist/services/scraper.js
vendored
@@ -3,20 +3,52 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.USER_AGENT_GROUPS = exports.USER_AGENTS = void 0;
|
||||
exports.getUserAgent = getUserAgent;
|
||||
exports.scrapeCategory = scrapeCategory;
|
||||
exports.saveProducts = saveProducts;
|
||||
exports.scrapeStore = scrapeStore;
|
||||
const puppeteer_1 = __importDefault(require("puppeteer"));
|
||||
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
||||
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const minio_1 = require("../utils/minio");
|
||||
const logger_1 = require("./logger");
|
||||
const USER_AGENTS = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||
];
|
||||
function getRandomUserAgent() {
|
||||
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
|
||||
const scraper_monitor_1 = require("../routes/scraper-monitor");
|
||||
const proxy_1 = require("./proxy");
|
||||
const age_gate_1 = require("../utils/age-gate");
|
||||
const availability_1 = require("./availability");
|
||||
// Apply stealth plugin for antidetect/anti-fingerprinting
|
||||
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
||||
exports.USER_AGENTS = {
|
||||
'chrome-windows': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'chrome-mac': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'chrome-linux': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'mobile-ios': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
|
||||
'mobile-android': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
|
||||
'googlebot': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
||||
'bingbot': 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
|
||||
};
|
||||
exports.USER_AGENT_GROUPS = {
|
||||
desktop: ['chrome-windows', 'chrome-mac', 'chrome-linux'],
|
||||
mobile: ['mobile-ios', 'mobile-android'],
|
||||
serp: ['googlebot', 'bingbot']
|
||||
};
|
||||
function getRandomUserAgentFromGroup(group) {
|
||||
const randomKey = group[Math.floor(Math.random() * group.length)];
|
||||
return exports.USER_AGENTS[randomKey];
|
||||
}
|
||||
function getUserAgent(key) {
|
||||
if (!key)
|
||||
return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
|
||||
// Check if it's a group
|
||||
if (key === 'rotate-desktop')
|
||||
return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
|
||||
if (key === 'rotate-mobile')
|
||||
return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.mobile);
|
||||
if (key === 'rotate-serp')
|
||||
return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.serp);
|
||||
// Otherwise treat as specific UA
|
||||
return exports.USER_AGENTS[key] || getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
|
||||
}
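Callers pass either a specific key or one of the rotate-* group aliases; anything unrecognized falls back to a random desktop UA. Sketch:

// Sketch: the ways getUserAgent() is meant to be called.
const ua1 = getUserAgent('chrome-mac');      // exact UA by key
const ua2 = getUserAgent('rotate-mobile');   // random pick from the mobile group
const ua3 = getUserAgent();                  // default: random desktop UA
const ua4 = getUserAgent('not-a-real-key');  // unknown key -> random desktop UA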
|
||||
function extractImageIdFromUrl(url) {
|
||||
try {
|
||||
@@ -44,19 +76,6 @@ function sanitizeProductData(product) {
|
||||
cbd: product.cbd && product.cbd < 100 ? product.cbd : null
|
||||
};
|
||||
}
|
||||
async function getActiveProxy() {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT host, port, protocol, username, password
|
||||
FROM proxies
|
||||
WHERE active = true AND is_anonymous = true
|
||||
ORDER BY RANDOM()
|
||||
LIMIT 1
|
||||
`);
|
||||
if (result.rows.length === 0) {
|
||||
return null;
|
||||
}
|
||||
return result.rows[0];
|
||||
}
|
||||
async function makePageStealthy(page) {
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
@@ -86,12 +105,11 @@ async function makePageStealthy(page) {
|
||||
});
|
||||
}
|
||||
async function scrapeProductDetails(page, productUrl, productName) {
|
||||
const maxRetries = 2;
|
||||
const maxRetries = 3;
|
||||
let lastError = null;
|
||||
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
||||
try {
|
||||
await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 20000 });
|
||||
await page.waitForTimeout(3000);
|
||||
await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
||||
const details = await page.evaluate(() => {
|
||||
const allText = document.body.textContent || '';
|
||||
let fullSizeImage = null;
|
||||
@@ -233,9 +251,7 @@ async function scrapeProductDetails(page, productUrl, productName) {
|
||||
catch (error) {
|
||||
lastError = error;
|
||||
logger_1.logger.warn('scraper', ` Attempt ${attempt}/${maxRetries} failed for ${productName}: ${error}`);
|
||||
if (attempt < maxRetries) {
|
||||
await page.waitForTimeout(2000);
|
||||
}
|
||||
// No delays - just retry immediately
|
||||
}
|
||||
}
|
||||
logger_1.logger.error('scraper', ` ✗ All attempts failed for ${productName}`);
|
||||
@@ -253,8 +269,10 @@ async function scrapeProductDetails(page, productUrl, productName) {
|
||||
weights: []
|
||||
};
|
||||
}
|
||||
async function scrapeCategory(storeId, categoryId) {
|
||||
async function scrapeCategory(storeId, categoryId, userAgent) {
|
||||
let browser = null;
|
||||
const scraperId = `cat-${categoryId}-${Date.now()}`;
|
||||
let proxyId = null;
|
||||
try {
|
||||
const categoryResult = await migrate_1.pool.query(`
|
||||
SELECT c.*, s.slug as store_slug, s.name as store_name
|
||||
@@ -267,7 +285,12 @@ async function scrapeCategory(storeId, categoryId) {
|
||||
}
|
||||
const category = categoryResult.rows[0];
|
||||
logger_1.logger.info('scraper', `Scraping category: ${category.name} for ${category.store_name}`);
|
||||
const proxy = await getActiveProxy();
|
||||
// Register scraper with monitoring system
|
||||
(0, scraper_monitor_1.registerScraper)(scraperId, storeId, category.store_name, categoryId, category.name);
|
||||
const proxy = await (0, proxy_1.getActiveProxy)();
|
||||
if (proxy) {
|
||||
proxyId = proxy.id;
|
||||
}
|
||||
const launchOptions = {
|
||||
headless: 'new',
|
||||
args: [
|
||||
@@ -287,24 +310,51 @@ async function scrapeCategory(storeId, categoryId) {
|
||||
}
|
||||
logger_1.logger.info('scraper', `Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
|
||||
}
|
||||
browser = await puppeteer_1.default.launch(launchOptions);
|
||||
browser = await puppeteer_extra_1.default.launch(launchOptions);
|
||||
const page = await browser.newPage();
|
||||
await makePageStealthy(page);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
await page.setUserAgent(getRandomUserAgent());
|
||||
// Use provided userAgent or random if not specified
|
||||
const ua = getUserAgent(userAgent);
|
||||
await page.setUserAgent(ua);
|
||||
// Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
|
||||
const state = (0, age_gate_1.detectStateFromUrl)(category.dutchie_url);
|
||||
await (0, age_gate_1.setAgeGateCookies)(page, category.dutchie_url, state);
|
||||
logger_1.logger.info('scraper', `Loading page: ${category.dutchie_url}`);
|
||||
try {
|
||||
await page.goto(category.dutchie_url, {
|
||||
waitUntil: 'domcontentloaded',
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 60000
|
||||
});
|
||||
await page.waitForTimeout(5000);
|
||||
// If age gate still appears, try to bypass it
|
||||
await (0, age_gate_1.bypassAgeGate)(page, state);
|
||||
// Wait for products to load
|
||||
await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
|
||||
timeout: 30000,
|
||||
}).catch(() => {
|
||||
logger_1.logger.warn('scraper', 'No product selectors found, trying anyway...');
|
||||
});
|
||||
logger_1.logger.info('scraper', 'Scrolling to load all products...');
|
||||
await autoScroll(page);
|
||||
await page.waitForTimeout(3000);
|
||||
}
|
||||
catch (navError) {
|
||||
logger_1.logger.error('scraper', `Navigation error: ${navError}`);
|
||||
// Check if this is bot detection - put proxy in timeout instead of hard failure
|
||||
if (proxyId) {
|
||||
const errorMsg = String(navError);
|
||||
if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
|
||||
// Bot detection! Put this proxy in timeout and get a new one
|
||||
logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
|
||||
(0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
|
||||
throw new Error(`Bot detection: ${errorMsg}`);
|
||||
}
|
||||
else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
|
||||
errorMsg.includes('ERR_') || errorMsg.includes('Navigation')) {
|
||||
// Regular proxy failure - increment failure count
|
||||
logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
|
||||
await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
|
||||
}
|
||||
}
|
||||
throw navError;
|
||||
}
|
||||
logger_1.logger.info('scraper', 'Extracting product list from page...');
|
||||
@@ -336,6 +386,21 @@ async function scrapeCategory(storeId, categoryId) {
|
||||
originalPrice = parseFloat(priceMatches[1].replace('$', ''));
|
||||
}
|
||||
}
|
||||
// Extract variant (weight/size) - look for common patterns
|
||||
let variant = null;
|
||||
const variantPatterns = [
|
||||
/(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i, // Weight units
|
||||
/(\d+\s*pack)/i, // Pack sizes
|
||||
/(\d+\s*ct)/i, // Count
|
||||
/(\d+\s*x\s*\d+\.?\d*\s*(?:g|mg|ml))/i // Multi-pack (e.g., 5x0.5g)
|
||||
];
|
||||
for (const pattern of variantPatterns) {
|
||||
const match = allText.match(pattern);
|
||||
if (match) {
|
||||
variant = match[1].trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
const linkEl = card.querySelector('a[href*="/product/"]');
|
||||
let href = linkEl?.href || linkEl?.getAttribute('href') || '';
|
||||
if (href && href.startsWith('/')) {
|
||||
@@ -343,6 +408,7 @@ async function scrapeCategory(storeId, categoryId) {
|
||||
}
|
||||
items.push({
|
||||
name,
|
||||
variant,
|
||||
price,
|
||||
originalPrice,
|
||||
href: href || window.location.href
|
||||
@@ -358,10 +424,19 @@ async function scrapeCategory(storeId, categoryId) {
|
||||
logger_1.logger.info('scraper', `Now visiting each product page for complete details...`);
|
||||
let successCount = 0;
|
||||
let failCount = 0;
|
||||
// Update initial stats
|
||||
(0, scraper_monitor_1.updateScraperStats)(scraperId, {
|
||||
productsProcessed: 0,
|
||||
productsTotal: products.length
|
||||
});
|
||||
for (let i = 0; i < products.length; i++) {
|
||||
const product = products[i];
|
||||
try {
|
||||
logger_1.logger.info('scraper', ` [${i + 1}/${products.length}] ${product.name}`);
|
||||
(0, scraper_monitor_1.updateScraperStats)(scraperId, {
|
||||
productsProcessed: i + 1,
|
||||
productsTotal: products.length
|
||||
}, `Processing: ${product.name}`);
|
||||
if (!product.href) {
|
||||
logger_1.logger.warn('scraper', ` ⚠ No product URL, skipping details`);
|
||||
product.metadata = {};
|
||||
@@ -391,7 +466,7 @@ async function scrapeCategory(storeId, categoryId) {
|
||||
logger_1.logger.warn('scraper', ` ⚠ Limited data extracted`);
|
||||
failCount++;
|
||||
}
|
||||
await page.waitForTimeout(1500);
|
||||
// No delays - scrape fast!
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('scraper', ` ✗ Unexpected error: ${error}`);
|
||||
@@ -411,11 +486,16 @@ async function scrapeCategory(storeId, categoryId) {
|
||||
SET last_scraped_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $1
|
||||
`, [categoryId]);
|
||||
// Mark scraper as complete
|
||||
(0, scraper_monitor_1.completeScraper)(scraperId);
|
||||
const formattedProducts = products.map((p, index) => {
|
||||
const sanitized = sanitizeProductData(p);
|
||||
// Normalize availability from Dutchie product data
|
||||
const availability = (0, availability_1.normalizeAvailability)(p);
|
||||
return {
|
||||
dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
|
||||
name: sanitized.name,
|
||||
variant: p.variant || null,
|
||||
description: sanitized.description,
|
||||
price: p.price,
|
||||
originalPrice: p.originalPrice,
|
||||
@@ -426,13 +506,34 @@ async function scrapeCategory(storeId, categoryId) {
|
||||
weight: sanitized.weight,
|
||||
imageUrl: p.imageUrl,
|
||||
dutchieUrl: p.href,
|
||||
metadata: p.metadata || {}
|
||||
metadata: p.metadata || {},
|
||||
availabilityStatus: availability.status,
|
||||
availabilityRaw: availability.raw,
|
||||
stockQuantity: availability.quantity
|
||||
};
|
||||
});
|
||||
return formattedProducts;
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('scraper', `❌ Category scraping error: ${error}`);
|
||||
// Smart proxy error handling
|
||||
if (proxyId) {
|
||||
const errorMsg = String(error);
|
||||
if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
|
||||
// Bot detection! Put this proxy in timeout
|
||||
logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
|
||||
(0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
|
||||
}
|
||||
else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
|
||||
errorMsg.includes('ERR_') || errorMsg.includes('Navigation') ||
|
||||
errorMsg.includes('Protocol error') || errorMsg.includes('Target closed')) {
|
||||
// Regular proxy failure - increment failure count
|
||||
logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
|
||||
await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
|
||||
}
|
||||
}
|
||||
// Mark scraper as failed
|
||||
(0, scraper_monitor_1.completeScraper)(scraperId, String(error));
|
||||
if (browser) {
|
||||
try {
|
||||
await browser.close();
|
||||
@@ -466,51 +567,84 @@ async function saveProducts(storeId, categoryId, products) {
|
||||
try {
|
||||
await client.query('BEGIN');
|
||||
logger_1.logger.info('scraper', `Saving ${products.length} products to database...`);
|
||||
// Mark all products as out-of-stock before processing (they'll be re-marked if found)
|
||||
// Also update availability_status and last_seen_out_of_stock_at for state transition tracking
|
||||
await client.query(`
|
||||
UPDATE products
|
||||
SET in_stock = false
|
||||
WHERE store_id = $1 AND category_id = $2
|
||||
SET in_stock = false,
|
||||
availability_status = 'out_of_stock',
|
||||
last_seen_out_of_stock_at = CASE
|
||||
WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP
|
||||
ELSE last_seen_out_of_stock_at
|
||||
END
|
||||
WHERE store_id = $1 AND category_id = $2 AND in_stock = true
|
||||
`, [storeId, categoryId]);
|
||||
for (const product of products) {
|
||||
try {
|
||||
// Get availability from product (defaults to in_stock if product exists in scraped data)
|
||||
const availStatus = product.availabilityStatus || 'in_stock';
|
||||
const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null;
|
||||
const stockQty = product.stockQuantity ?? null;
|
||||
const existingResult = await client.query(`
|
||||
SELECT id, image_url, local_image_path
|
||||
SELECT id, image_url, local_image_path, availability_status
|
||||
FROM products
|
||||
WHERE store_id = $1 AND name = $2 AND category_id = $3
|
||||
`, [storeId, product.name, categoryId]);
|
||||
AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
|
||||
`, [storeId, product.name, categoryId, product.variant || null]);
|
||||
let localImagePath = null;
|
||||
let productId;
|
||||
if (existingResult.rows.length > 0) {
|
||||
productId = existingResult.rows[0].id;
|
||||
localImagePath = existingResult.rows[0].local_image_path;
|
||||
const prevStatus = existingResult.rows[0].availability_status;
|
||||
// Determine if we need to update last_seen_in_stock_at
|
||||
const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
|
||||
const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
|
||||
await client.query(`
|
||||
UPDATE products
|
||||
SET name = $1, description = $2, price = $3,
|
||||
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
|
||||
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
|
||||
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $12
|
||||
SET name = $1, variant = $2, description = $3, price = $4,
|
||||
strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
|
||||
brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
|
||||
in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP,
|
||||
availability_status = $14,
|
||||
availability_raw = $15,
|
||||
stock_quantity = $16,
|
||||
last_seen_in_stock_at = CASE
|
||||
WHEN $17 THEN CURRENT_TIMESTAMP
|
||||
ELSE last_seen_in_stock_at
|
||||
END
|
||||
WHERE id = $13
|
||||
`, [
|
||||
product.name, product.description, product.price,
|
||||
product.name, product.variant, product.description, product.price,
|
||||
product.strainType, product.thcPercentage, product.cbdPercentage,
|
||||
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
|
||||
JSON.stringify(product.metadata), productId
|
||||
JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty,
|
||||
isNowInStock && wasOutOfStock
|
||||
]);
|
||||
}
|
||||
else {
|
||||
// Generate unique slug from product name + timestamp + random suffix
|
||||
const baseSlug = product.name
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9]+/g, '-')
|
||||
.replace(/^-|-$/g, '')
|
||||
.substring(0, 150);
|
||||
const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).substr(2, 6)}`;
|
||||
const slug = `${baseSlug}-${uniqueSuffix}`;
|
||||
const insertResult = await client.query(`
|
||||
INSERT INTO products (
|
||||
store_id, category_id, dutchie_product_id, name, description,
|
||||
store_id, category_id, dutchie_product_id, name, slug, variant, description,
|
||||
price, strain_type, thc_percentage, cbd_percentage,
|
||||
brand, weight, image_url, dutchie_url, in_stock, metadata
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
|
||||
brand, weight, image_url, dutchie_url, in_stock, metadata,
|
||||
availability_status, availability_raw, stock_quantity, last_seen_in_stock_at
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP)
|
||||
RETURNING id
|
||||
`, [
|
||||
storeId, categoryId, product.dutchieProductId, product.name, product.description,
|
||||
storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
|
||||
product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
|
||||
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
|
||||
JSON.stringify(product.metadata)
|
||||
JSON.stringify(product.metadata), availStatus, availRaw, stockQty
|
||||
]);
|
||||
productId = insertResult.rows[0].id;
|
||||
}
|
||||
@@ -544,19 +678,15 @@ async function saveProducts(storeId, categoryId, products) {
|
||||
client.release();
|
||||
}
|
||||
}
|
||||
async function scrapeStore(storeId) {
|
||||
async function scrapeStore(storeId, parallel = 3, userAgent) {
|
||||
try {
|
||||
logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId}`);
|
||||
logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId} (${parallel} parallel, UA: ${userAgent || 'random'})`);
|
||||
const categoriesResult = await migrate_1.pool.query(`
|
||||
SELECT c.id, c.name, c.slug, c.dutchie_url
|
||||
FROM categories c
|
||||
WHERE c.store_id = $1
|
||||
AND c.scrape_enabled = true
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM categories child
|
||||
WHERE child.parent_id = c.id
|
||||
)
|
||||
ORDER BY c.display_order, c.name
|
||||
WHERE c.store_id = $1
|
||||
AND c.scrape_enabled = true
|
||||
ORDER BY c.name
|
||||
`, [storeId]);
|
||||
logger_1.logger.info('scraper', `Found ${categoriesResult.rows.length} categories to scrape`);
|
||||
for (const category of categoriesResult.rows) {
|
||||
@@ -564,14 +694,14 @@ async function scrapeStore(storeId) {
|
||||
logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
logger_1.logger.info('scraper', `📂 Scraping: ${category.name}`);
|
||||
logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
const products = await scrapeCategory(storeId, category.id);
|
||||
const products = await scrapeCategory(storeId, category.id, userAgent);
|
||||
await saveProducts(storeId, category.id, products);
|
||||
logger_1.logger.info('scraper', `✅ Completed ${category.name} - ${products.length} products saved`);
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('scraper', `❌ Failed to scrape ${category.name}: ${error}`);
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 5000));
|
||||
// No delays - scrape fast!
|
||||
}
|
||||
await migrate_1.pool.query(`
|
||||
UPDATE stores
|
||||
|
||||
351
backend/dist/services/store-crawl-orchestrator.js
vendored
Normal file
@@ -0,0 +1,351 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Store Crawl Orchestrator
|
||||
*
|
||||
* Orchestrates the complete crawl workflow for a store:
|
||||
* 1. Load store and its linked dispensary
|
||||
* 2. Check if provider detection is needed
|
||||
* 3. Run provider detection if needed
|
||||
* 4. Queue appropriate crawl jobs based on provider/mode
|
||||
* 5. Update store_crawl_schedule with meaningful status
|
||||
*
|
||||
* This replaces the simple "triggerManualCrawl" with intelligent orchestration.
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.runStoreCrawlOrchestrator = runStoreCrawlOrchestrator;
|
||||
exports.runBatchOrchestrator = runBatchOrchestrator;
|
||||
exports.getStoresDueForOrchestration = getStoresDueForOrchestration;
|
||||
const uuid_1 = require("uuid");
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const crawler_logger_1 = require("./crawler-logger");
|
||||
const intelligence_detector_1 = require("./intelligence-detector");
|
||||
const category_crawler_jobs_1 = require("./category-crawler-jobs");
|
||||
// DEPRECATED: scrapeStore writes to legacy products table
|
||||
// import { scrapeStore } from '../scraper-v2';
|
||||
// Import the new dutchie-az pipeline for Dutchie crawling
|
||||
const product_crawler_1 = require("../dutchie-az/services/product-crawler");
|
||||
const connection_1 = require("../dutchie-az/db/connection");
|
||||
// ========================================
|
||||
// Main Orchestrator Function
|
||||
// ========================================
|
||||
/**
|
||||
* Run the complete crawl orchestration for a store
|
||||
*
|
||||
* Behavior:
|
||||
* 1. Load the store and its linked dispensary
|
||||
* 2. If no dispensary is linked, report error
|
||||
* 3. If product_provider is missing or stale (>7 days), run detection
|
||||
* 4. After detection:
|
||||
* - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
|
||||
* - Otherwise: Run sandbox crawl
|
||||
* 5. Update store_crawl_schedule with status/summary
|
||||
*/
|
||||
async function runStoreCrawlOrchestrator(storeId) {
|
||||
const startTime = Date.now();
|
||||
const runId = (0, uuid_1.v4)();
|
||||
let result = {
|
||||
status: 'pending',
|
||||
summary: '',
|
||||
runId,
|
||||
storeId,
|
||||
dispensaryId: null,
|
||||
detectionRan: false,
|
||||
crawlRan: false,
|
||||
durationMs: 0,
|
||||
};
|
||||
try {
|
||||
// Mark schedule as running
|
||||
await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId);
|
||||
// 1. Load store with dispensary info
|
||||
const store = await getStoreWithDispensary(storeId);
|
||||
if (!store) {
|
||||
throw new Error(`Store ${storeId} not found`);
|
||||
}
|
||||
result.dispensaryId = store.dispensary_id;
|
||||
// 2. Check if dispensary is linked
|
||||
if (!store.dispensary_id) {
|
||||
result.status = 'error';
|
||||
result.summary = 'No dispensary linked - cannot determine provider';
|
||||
result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.';
|
||||
await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
|
||||
result.durationMs = Date.now() - startTime;
|
||||
return result;
|
||||
}
|
||||
// 3. Check if provider detection is needed
|
||||
const needsDetection = await checkNeedsDetection(store);
|
||||
if (needsDetection) {
|
||||
// Run provider detection
|
||||
const websiteUrl = store.dispensary_menu_url || store.dispensary_website;
|
||||
if (!websiteUrl) {
|
||||
result.status = 'error';
|
||||
result.summary = 'No website URL available for detection';
|
||||
result.error = 'Dispensary has no menu_url or website configured';
|
||||
await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
|
||||
result.durationMs = Date.now() - startTime;
|
||||
return result;
|
||||
}
|
||||
await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId);
|
||||
const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
|
||||
result.detectionRan = true;
|
||||
result.detectionResult = detectionResult;
|
||||
// Save detection results to dispensary
|
||||
await (0, intelligence_detector_1.updateAllCategoryProviders)(store.dispensary_id, detectionResult);
|
||||
crawler_logger_1.crawlerLogger.providerDetected({
|
||||
dispensary_id: store.dispensary_id,
|
||||
dispensary_name: store.dispensary_name || store.name,
|
||||
detected_provider: detectionResult.product.provider,
|
||||
confidence: detectionResult.product.confidence,
|
||||
detection_method: 'orchestrator_run',
|
||||
menu_url: websiteUrl,
|
||||
category: 'product',
|
||||
});
|
||||
// Refresh store info after detection
|
||||
const updatedStore = await getStoreWithDispensary(storeId);
|
||||
if (updatedStore) {
|
||||
Object.assign(store, updatedStore);
|
||||
}
|
||||
}
|
||||
// 4. Determine crawl type and run
|
||||
const provider = store.product_provider;
|
||||
const mode = store.product_crawler_mode;
|
||||
if (provider === 'dutchie' && mode === 'production') {
|
||||
// Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline
|
||||
await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId);
|
||||
try {
|
||||
// Look up the dispensary in the dutchie-az database
|
||||
// The dutchie-az pipeline has its own dispensaries table
// We match by dispensary name or store slug (case-insensitive ILIKE)
const dispensaryResult = await (0, connection_1.query)(`SELECT * FROM dispensaries
WHERE name ILIKE $1
OR slug ILIKE $2
LIMIT 1`, [store.dispensary_name, store.slug]);
if (dispensaryResult.rows.length === 0) {
throw new Error(`Dispensary not found in dutchie-az database. ` +
`You must add this dispensary to the dutchie-az pipeline first. ` +
`Store: ${store.name} (${store.dispensary_name})`);
}
const dutchieDispensary = dispensaryResult.rows[0];
// Run the new dutchie-az GraphQL crawler
const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dutchieDispensary, 'rec', { useBothModes: true });
result.crawlRan = true;
result.crawlType = 'production';
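// Note: "new" and "updated" below are approximations taken from the dutchie-az
// counters (products upserted and snapshots created, respectively).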
result.productsFound = crawlResult.productsFound ?? undefined;
result.productsNew = crawlResult.productsUpserted ?? undefined;
result.productsUpdated = crawlResult.snapshotsCreated ?? undefined;
if (crawlResult.success) {
const detectionPart = result.detectionRan ? 'Detection + ' : '';
result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`;
result.status = 'success';
// Update store's last_scraped_at
await migrate_1.pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]);
crawler_logger_1.crawlerLogger.jobCompleted({
job_id: 0, // Orchestrator doesn't create traditional jobs
store_id: storeId,
store_name: store.name,
duration_ms: crawlResult.durationMs,
products_found: crawlResult.productsFound || 0,
products_new: crawlResult.productsUpserted || 0,
products_updated: crawlResult.snapshotsCreated || 0,
provider: 'dutchie',
});
}
else {
throw new Error(crawlResult.errorMessage || 'Crawl failed');
}
}
catch (crawlError) {
result.status = 'error';
result.error = crawlError.message;
result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'production';
crawler_logger_1.crawlerLogger.jobFailed({
job_id: 0,
store_id: storeId,
store_name: store.name,
duration_ms: Date.now() - startTime,
error_message: crawlError.message,
provider: 'dutchie',
});
}
}
else if (provider && provider !== 'unknown') {
// Sandbox crawl for non-Dutchie or sandbox mode
await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId);
try {
const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(store.dispensary_id);
result.crawlRan = true;
result.crawlType = 'sandbox';
result.productsFound = sandboxResult.data?.productsExtracted || 0;
const detectionPart = result.detectionRan ? 'Detection + ' : '';
if (sandboxResult.success) {
result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
result.status = 'sandbox_only';
}
else {
result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
result.status = 'error';
result.error = sandboxResult.message;
}
}
catch (sandboxError) {
result.status = 'error';
result.error = sandboxError.message;
result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'sandbox';
}
}
else {
// No provider detected - detection only
if (result.detectionRan) {
result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`;
result.status = 'detection_only';
}
else {
result.summary = 'No provider detected and no crawl possible';
result.status = 'error';
result.error = 'Could not determine menu provider';
}
}
}
catch (error) {
result.status = 'error';
result.error = error.message;
result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
crawler_logger_1.crawlerLogger.queueFailure({
queue_type: 'orchestrator',
error_message: error.message,
});
}
result.durationMs = Date.now() - startTime;
// Update final schedule status
await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error);
// Create a crawl_job record for tracking
await createOrchestratorJobRecord(storeId, result);
return result;
}
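// Illustrative usage (sketch only; the store id and calling context are hypothetical):
// const result = await runStoreCrawlOrchestrator(42);
// if (result.status === 'error') console.error(result.error);
// else console.log(result.summary);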
// ========================================
// Helper Functions
// ========================================
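/**
 * Load a store joined to its linked dispensary. The LEFT JOIN means the dispensary
 * columns come back as null when no dispensary is linked.
 */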
async function getStoreWithDispensary(storeId) {
const result = await migrate_1.pool.query(`SELECT
s.id, s.name, s.slug, s.timezone, s.dispensary_id,
d.name as dispensary_name,
d.menu_url as dispensary_menu_url,
d.website as dispensary_website,
d.product_provider,
d.product_confidence,
d.product_crawler_mode,
d.last_product_scan_at
FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
WHERE s.id = $1`, [storeId]);
return result.rows[0] || null;
}
async function checkNeedsDetection(store) {
// No dispensary = can't detect
if (!store.dispensary_id)
return false;
// No provider = definitely needs detection
if (!store.product_provider)
return true;
// Unknown provider = needs detection
if (store.product_provider === 'unknown')
return true;
// Low confidence = needs re-detection
if (store.product_confidence !== null && store.product_confidence < 50)
return true;
// Stale detection (> 7 days) = needs refresh
if (store.last_product_scan_at) {
const daysSince = (Date.now() - new Date(store.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
if (daysSince > 7)
return true;
}
return false;
}
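/**
 * Upsert the per-store schedule row with the latest run status. The ON CONFLICT
 * (store_id) clause assumes a unique constraint on store_crawl_schedule.store_id.
 * Note: runId is accepted as a parameter but is not persisted by this query.
 */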
async function updateScheduleStatus(storeId, status, summary, runId, error) {
await migrate_1.pool.query(`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error)
VALUES ($1, $2, $3, NOW(), $4)
ON CONFLICT (store_id) DO UPDATE SET
last_status = $2,
last_summary = $3,
last_run_at = NOW(),
last_error = $4,
updated_at = NOW()`, [storeId, status, summary, error || null]);
}
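/**
 * Rough crawl stats derived from the legacy products table: rows created within the
 * last hour count as "new"; rows updated (but not created) within the last hour count
 * as "updated". Not referenced elsewhere in this file; the orchestrator takes its
 * counts from the dutchie-az crawl result instead.
 */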
async function getLatestCrawlStats(storeId) {
// Get count of products for this store
const result = await migrate_1.pool.query(`SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new,
COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated
FROM products
WHERE store_id = $1`, [storeId]);
return {
products_found: parseInt(result.rows[0]?.total || '0'),
products_new: parseInt(result.rows[0]?.recent_new || '0'),
products_updated: parseInt(result.rows[0]?.recent_updated || '0'),
};
}
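/**
 * Record the orchestrator run as a crawl_jobs row so it shows up alongside regular
 * crawl jobs in monitoring. Status collapses to 'completed' unless the run errored;
 * detection details, when present, are stored as JSON in detection_result.
 */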
async function createOrchestratorJobRecord(storeId, result) {
await migrate_1.pool.query(`INSERT INTO crawl_jobs (
store_id, job_type, trigger_type, status, priority,
scheduled_at, started_at, completed_at,
products_found, products_new, products_updated,
error_message, orchestrator_run_id, detection_result
) VALUES (
$1, 'orchestrator', 'manual', $2, 100,
NOW(), NOW(), NOW(),
$3, $4, $5,
$6, $7, $8
)`, [
storeId,
result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
result.productsFound || null,
result.productsNew || null,
result.productsUpdated || null,
result.error || null,
result.runId,
result.detectionResult ? JSON.stringify({
product_provider: result.detectionResult.product.provider,
product_confidence: result.detectionResult.product.confidence,
product_mode: result.detectionResult.product.mode,
}) : null,
]);
}
// ========================================
// Batch Orchestration
// ========================================
/**
* Run orchestrator for multiple stores
*/
async function runBatchOrchestrator(storeIds, concurrency = 3) {
const results = [];
// Process in batches
for (let i = 0; i < storeIds.length; i += concurrency) {
const batch = storeIds.slice(i, i + concurrency);
const batchResults = await Promise.all(batch.map(storeId => runStoreCrawlOrchestrator(storeId)));
results.push(...batchResults);
}
return results;
}
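// Illustrative pairing (sketch only): orchestrate whatever stores are currently due,
// three at a time.
// const due = await getStoresDueForOrchestration(10);
// const results = await runBatchOrchestrator(due, 3);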
/**
* Get stores that are due for orchestration
*/
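// A store is "due" when crawling is enabled, it has either never run or its last run is
// older than interval_hours (default 4), and its last status is not 'running' or 'pending'.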
async function getStoresDueForOrchestration(limit = 10) {
const result = await migrate_1.pool.query(`SELECT s.id
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.enabled, TRUE) = TRUE
AND (
scs.last_run_at IS NULL
OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL
)
AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending'))
ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST
LIMIT $1`, [limit]);
return result.rows.map(row => row.id);
}