Files
cannaiq/backend/src/tasks/handlers/product-discovery-dutchie.ts
Kelly 9f3bc8a843
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
fix: Worker task concurrency limit and inventory tracking
- Fix claim_task to enforce max 5 tasks per worker (was unlimited)
- Add session_task_count check before ANY claiming path
- Add triggers to auto-decrement count on task complete/release
- Update MAX_CONCURRENT_TASKS default from 3 to 5
- Update frontend fallback to show 5 task slots

- Add Wasabi S3 storage for payload archival
- Add inventory snapshots service (delta-only tracking)
- Add sales analytics views and routes
- Add high-frequency manager UI components
- Reset hardcoded AZ 5-minute intervals (use UI to configure)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-17 01:34:38 -07:00

537 lines
19 KiB
TypeScript

/**
* Product Discovery Handler - Dutchie Platform
*
* Uses Puppeteer + StealthPlugin to fetch products via browser context.
* Based on test-intercept.js pattern from ORGANIC_SCRAPING_GUIDE.md.
*
* Naming convention: {task}-{platform}.ts
*
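* This handler:
* 1. Loads dispensary info
* 2. Launches headless browser with proxy (if provided)
* 3. Establishes session by visiting embedded menu (dismissing the age gate if shown)
* 4. Fetches ALL products via GraphQL from browser context
* 5. Archives raw payload to Wasabi S3 (optional - skipped when not configured)
* 6. Saves a daily baseline payload via saveDailyBaseline when inside the
*    baseline window (metadata recorded in raw_crawl_payloads table)
* 7. Saves inventory snapshots and detects visibility events
* 8. Queues product_refresh task to process the payload (only when a baseline was saved)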
*
* Why browser-based:
* - Works with session-based residential proxies (Evomi)
* - Lower detection risk than curl/axios
* - Real Chrome TLS fingerprint
*/
import { TaskContext, TaskResult } from '../task-worker';
import { saveDailyBaseline } from '../../utils/payload-storage';
import { taskService } from '../task-service';
import { saveInventorySnapshots } from '../../services/inventory-snapshots';
import { detectVisibilityEvents } from '../../services/visibility-events';
import { storePayload as storeWasabiPayload, checkConnection as checkWasabiConnection } from '../../services/wasabi-storage';
// GraphQL hash for FilteredProducts query - MUST match CLAUDE.md
// Sent as extensions.persistedQuery.sha256Hash on every FilteredProducts
// request issued from the browser context in handleProductDiscoveryDutchie.
const FILTERED_PRODUCTS_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
/**
 * Runs a product discovery task for one Dutchie dispensary.
 *
 * Loads the dispensary row, launches a stealth Puppeteer browser (through the
 * current CrawlRotator proxy when one is available), establishes a session on
 * the embedded menu page, pages through the FilteredProducts persisted query
 * from inside the page, then archives/saves the payload, records inventory
 * snapshots and visibility events, and queues a product_refresh task when a
 * daily baseline was saved.
 *
 * @param ctx - Task context; this handler reads `pool`, `task`, `crawlRotator`,
 *   `updateStep`, `heartbeat` and the optional `fingerprint` from it.
 * @returns A TaskResult. Errors raised inside the crawl are caught, counted in
 *   `dispensaries.consecutive_failures`, and reported as
 *   `{ success: false, error }` rather than rethrown.
 */
export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task, crawlRotator, updateStep } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return { success: false, error: 'No dispensary_id specified for product_discovery task' };
  }

  // Node-side delay. Used instead of page.waitForTimeout(), which is deprecated
  // and not available in newer Puppeteer releases.
  const sleep = (ms: number): Promise<void> => new Promise(resolve => setTimeout(resolve, ms));

  let browser: any = null;

  try {
    // ============================================================
    // STEP 1: Load dispensary info
    // ============================================================
    updateStep('loading', 'Loading dispensary info');

    const dispResult = await pool.query(`
      SELECT
        id, name, platform_dispensary_id, menu_url, menu_type, city, state
      FROM dispensaries
      WHERE id = $1 AND crawl_enabled = true
    `, [dispensaryId]);

    if (dispResult.rows.length === 0) {
      return { success: false, error: `Dispensary ${dispensaryId} not found or not crawl_enabled` };
    }

    const dispensary = dispResult.rows[0];
    const platformId = dispensary.platform_dispensary_id;

    if (!platformId) {
      return { success: false, error: `Dispensary ${dispensaryId} has no platform_dispensary_id` };
    }

    // Extract cName from menu_url (falls back to the literal 'dispensary' when the URL does not match)
    const cNameMatch = dispensary.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';

    console.log(`[ProductDiscoveryHTTP] Starting for ${dispensary.name} (ID: ${dispensaryId})`);
    console.log(`[ProductDiscoveryHTTP] Platform ID: ${platformId}, cName: ${cName}`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 2: Setup Puppeteer with proxy
    // ============================================================
    updateStep('preflight', `Launching browser for ${dispensary.name}`);

    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
    puppeteer.use(StealthPlugin());

    // Get proxy from CrawlRotator if available
    let proxyUrl: string | null = null;
    if (crawlRotator) {
      const currentProxy = crawlRotator.proxy.getCurrent();
      if (currentProxy) {
        proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
        console.log(`[ProductDiscoveryHTTP] Using proxy: ${currentProxy.host}:${currentProxy.port}`);
      }
    }

    // Parse the proxy URL once; it is needed for both the launch arg and page auth.
    const proxyUrlParsed = proxyUrl ? new URL(proxyUrl) : null;

    // Build browser args
    const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
    if (proxyUrlParsed) {
      // Only host:port goes on the command line; credentials are supplied via page.authenticate below.
      browserArgs.push(`--proxy-server=${proxyUrlParsed.host}`);
    }

    browser = await puppeteer.launch({
      headless: 'new',
      args: browserArgs,
    });

    const page = await browser.newPage();

    // Block unnecessary resources to save bandwidth
    // We only need HTML/JS for session, then GraphQL JSON
    await page.setRequestInterception(true);

    // Domains to block - analytics, tracking, feature flags (not needed for GraphQL)
    const BLOCKED_DOMAINS = [
      'googletagmanager.com',
      'google-analytics.com',
      'launchdarkly.com',
      'assets2.dutchie.com', // CDN assets - we only need GraphQL
      'sentry.io',
      'segment.io',
      'segment.com',
      'amplitude.com',
      'mixpanel.com',
      'hotjar.com',
      'fullstory.com',
    ];

    page.on('request', (request: any) => {
      const url = request.url();
      const resourceType = request.resourceType();

      // Block by domain - saves significant proxy bandwidth
      if (BLOCKED_DOMAINS.some(domain => url.includes(domain))) {
        request.abort();
        return;
      }

      // Block by resource type - images, fonts, media, stylesheets
      if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
        request.abort();
        return;
      }

      request.continue();
    });

    // Setup proxy auth if needed
    if (proxyUrlParsed && proxyUrlParsed.username && proxyUrlParsed.password) {
      await page.authenticate({
        username: decodeURIComponent(proxyUrlParsed.username),
        password: decodeURIComponent(proxyUrlParsed.password),
      });
    }

    // ============================================================
    // STEP 2b: Apply stored fingerprint (timezone, locale)
    // CRITICAL: Must match the IP's geographic location
    // ============================================================
    if (ctx.fingerprint?.timezone) {
      try {
        const client = await page.target().createCDPSession();
        await client.send('Emulation.setTimezoneOverride', { timezoneId: ctx.fingerprint.timezone });
        console.log(`[ProductDiscoveryHTTP] Browser timezone set to: ${ctx.fingerprint.timezone}`);
      } catch (tzErr: any) {
        // Best-effort: a failed override is logged but does not abort the crawl.
        console.warn(`[ProductDiscoveryHTTP] Failed to set timezone: ${tzErr.message}`);
      }
    }

    // Set locale to match proxy region (en-US for US proxies)
    if (ctx.fingerprint?.locale) {
      await page.setExtraHTTPHeaders({
        'Accept-Language': `${ctx.fingerprint.locale},en;q=0.9`,
      });
      console.log(`[ProductDiscoveryHTTP] Accept-Language set to: ${ctx.fingerprint.locale}`);
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 3: Establish session by visiting embedded menu
    // ============================================================
    updateStep('navigating', `Loading menu page`);

    const embedUrl = `https://dutchie.com/embedded-menu/${cName}?menuType=rec`;
    console.log(`[ProductDiscoveryHTTP] Establishing session at ${embedUrl}...`);

    await page.goto(embedUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // ============================================================
    // STEP 3b: Detect and dismiss age gate modal
    // ============================================================
    try {
      // Wait a bit for age gate to appear
      await sleep(1500);

      // Plain CSS selectors only. page.$() rejects Playwright-style
      // `:has-text()` selectors, so text-based matching is left to the
      // page.evaluate() fallback below.
      const ageGateSelectors = [
        'button[data-testid="age-gate-submit"]',
        '[class*="age-gate"] button',
        '[class*="AgeGate"] button',
        '[data-test="age-gate-button"]',
      ];

      for (const selector of ageGateSelectors) {
        try {
          const button = await page.$(selector);
          if (button) {
            await button.click();
            console.log(`[ProductDiscoveryHTTP] Age gate dismissed via: ${selector}`);
            await sleep(1000); // Wait for modal to close
            break;
          }
        } catch {
          // Lookup or click failed for this selector, try next
        }
      }

      // Also try evaluating in page context for button with specific text
      await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        for (const btn of buttons) {
          const text = btn.textContent?.toLowerCase() || '';
          if (text.includes('yes') || text.includes('enter') || text.includes('21')) {
            (btn as HTMLButtonElement).click();
            return true;
          }
        }
        return false;
      });
    } catch {
      // Age gate might not be present, continue
      console.log(`[ProductDiscoveryHTTP] No age gate detected or already dismissed`);
    }

    console.log(`[ProductDiscoveryHTTP] Session established, fetching products...`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 4: Fetch ALL products via GraphQL from browser context
    // ============================================================
    updateStep('fetching', `Executing GraphQL query`);

    // Everything inside this callback runs in the page, not in Node: it can
    // only see its arguments, and reports progress through the returned `logs`.
    const result = await page.evaluate(async (platformId: string, graphqlHash: string) => {
      const allProducts: any[] = [];
      const logs: string[] = [];
      let pageNum = 0;
      const perPage = 100;
      let totalCount = 0;
      const sessionId = 'browser-session-' + Date.now();

      try {
        while (pageNum < 30) { // Max 30 pages = 3000 products
          const variables = {
            includeEnterpriseSpecials: true, // Include BOGO/sale special names in product data
            productsFilter: {
              dispensaryId: platformId,
              pricingType: 'rec',
              Status: 'All', // 'All' = Active + Inactive products for sellout tracking
              types: [],
              useCache: true,
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageNum,
            perPage: perPage,
          };

          const extensions = {
            persistedQuery: {
              version: 1,
              sha256Hash: graphqlHash,
            },
          };

          // Build GET URL like the browser does
          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify(extensions),
          });
          const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;

          const response = await fetch(url, {
            method: 'GET',
            headers: {
              'Accept': 'application/json',
              'content-type': 'application/json',
              'x-dutchie-session': sessionId,
              'apollographql-client-name': 'Marketplace (production)',
            },
            credentials: 'include',
          });

          logs.push(`Page ${pageNum}: HTTP ${response.status}`);

          if (!response.ok) {
            const text = await response.text();
            logs.push(`HTTP error: ${response.status} - ${text.slice(0, 200)}`);
            break;
          }

          const json = await response.json();

          if (json.errors) {
            logs.push(`GraphQL error: ${JSON.stringify(json.errors).slice(0, 200)}`);
            break;
          }

          const data = json?.data?.filteredProducts;
          if (!data || !data.products) {
            logs.push('No products in response');
            break;
          }

          const products = data.products;
          allProducts.push(...products);

          if (pageNum === 0) {
            totalCount = data.queryInfo?.totalCount || 0;
            logs.push(`Total reported: ${totalCount}`);
          }

          logs.push(`Got ${products.length} products (total: ${allProducts.length}/${totalCount})`);

          // Stop on a short page, or once the reported total is reached. The
          // total is only trusted when it is positive: a missing/zero
          // totalCount must not end pagination after the first full page.
          const reachedReportedTotal = totalCount > 0 && allProducts.length >= totalCount;
          if (reachedReportedTotal || products.length < perPage) {
            break;
          }

          pageNum++;

          // Small delay between pages to be polite
          await new Promise(r => setTimeout(r, 200));
        }
      } catch (err: any) {
        // Keep whatever was collected so far; the failure is surfaced via logs.
        logs.push(`Error: ${err.message}`);
      }

      return { products: allProducts, totalCount, logs };
    }, platformId, FILTERED_PRODUCTS_HASH);

    // Print logs from browser context
    result.logs.forEach((log: string) => console.log(`[Browser] ${log}`));

    console.log(`[ProductDiscoveryHTTP] Fetched ${result.products.length} products (API reported ${result.totalCount})`);

    // Surface partial fetches (page error mid-pagination or the 30-page cap)
    // instead of letting a truncated list pass silently.
    if (result.totalCount > 0 && result.products.length < result.totalCount) {
      console.warn(`[ProductDiscoveryHTTP] Partial fetch: got ${result.products.length} of ${result.totalCount} reported products`);
    }

    // The browser is no longer needed; null it so the finally block does not close it twice.
    await browser.close();
    browser = null;

    if (result.products.length === 0) {
      // NOTE(review): this failure path returns without updating
      // consecutive_failures/stage, unlike thrown errors - confirm that is intended.
      return {
        success: false,
        error: 'No products returned from GraphQL',
        productsProcessed: 0,
      };
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 5: Archive raw payload to Wasabi S3 (long-term storage)
    // Every crawl is archived for potential reprocessing
    // ============================================================
    updateStep('saving', `Saving ${result.products.length} products`);

    const rawPayload = {
      dispensaryId,
      platformId,
      cName,
      fetchedAt: new Date().toISOString(),
      productCount: result.products.length,
      products: result.products,
    };

    // Archive to Wasabi S3 (if configured)
    let wasabiPath: string | null = null;
    try {
      const wasabiResult = await storeWasabiPayload(
        dispensaryId,
        dispensary.state || 'XX',
        'dutchie',
        rawPayload,
        {
          taskId: String(task.id),
          cName,
          productCount: String(result.products.length),
        }
      );
      wasabiPath = wasabiResult.path;
      const compressionRatio = Math.round((1 - wasabiResult.compressedBytes / wasabiResult.sizeBytes) * 100);
      console.log(`[ProductDiscoveryHTTP] Archived to Wasabi: ${wasabiPath} (${(wasabiResult.compressedBytes / 1024).toFixed(1)}KB, ${compressionRatio}% compression)`);
    } catch (wasabiErr: any) {
      // Wasabi archival is optional - don't fail the task if it fails
      if (wasabiErr.message?.includes('not configured')) {
        console.log(`[ProductDiscoveryHTTP] Wasabi not configured, skipping archive`);
      } else {
        console.warn(`[ProductDiscoveryHTTP] Wasabi archive failed: ${wasabiErr.message}`);
      }
    }

    // ============================================================
    // STEP 5b: Save daily baseline to PostgreSQL (if in window)
    // Daily baselines are saved once per day per store (12:01 AM - 3:00 AM)
    // Outside this window, only inventory snapshots are saved (Step 5.5)
    // ============================================================
    // saveDailyBaseline returns null if outside window or baseline already exists today
    const payloadResult = await saveDailyBaseline(
      pool,
      dispensaryId,
      rawPayload,
      null, // crawl_run_id - not using crawl_runs in new system
      result.products.length,
      'dutchie',
      task.id // task ID for traceability
    );

    if (payloadResult) {
      console.log(`[ProductDiscoveryHTTP] Saved daily baseline #${payloadResult.id} (${(payloadResult.sizeBytes / 1024).toFixed(1)}KB)`);
    } else {
      console.log(`[ProductDiscoveryHTTP] Skipped PostgreSQL baseline (outside window or already exists)`);
    }

    // ============================================================
    // STEP 5.5: Save inventory snapshots and detect visibility events
    // ============================================================
    const snapshotCount = await saveInventorySnapshots(pool, dispensaryId, result.products, 'dutchie');
    const eventCount = await detectVisibilityEvents(pool, dispensaryId, result.products, 'dutchie');
    console.log(`[ProductDiscoveryHTTP] Saved ${snapshotCount} inventory snapshots, detected ${eventCount} visibility events`);

    // ============================================================
    // STEP 6: Update dispensary last_fetch_at and tracking
    // ============================================================
    await pool.query(`
      UPDATE dispensaries
      SET last_fetch_at = NOW(),
          last_modified_at = NOW(),
          last_modified_by_task = $2,
          last_modified_task_id = $3
      WHERE id = $1
    `, [dispensaryId, task.role, task.id]);

    // ============================================================
    // STEP 7: Queue product_refresh task to process the payload
    // Only queue if a baseline payload was saved (need payload_id)
    // ============================================================
    if (payloadResult) {
      await taskService.createTask({
        role: 'product_refresh',
        dispensary_id: dispensaryId,
        priority: task.priority || 0,
        method: 'http', // Browser-only transport
        payload: { payload_id: payloadResult.id },
      });
      console.log(`[ProductDiscoveryHTTP] Queued product_refresh task for payload #${payloadResult.id}`);
    } else {
      console.log(`[ProductDiscoveryHTTP] Skipped product_refresh (no payload saved)`);
    }

    // ============================================================
    // STEP 8: Stage checkpoint - observational update
    // Discovery success → hydrating (awaiting product_refresh completion)
    // ============================================================
    await pool.query(`
      UPDATE dispensaries
      SET
        stage = CASE
          WHEN stage IN ('promoted', 'sandbox') THEN 'hydrating'
          WHEN stage = 'failing' THEN 'hydrating'
          ELSE stage
        END,
        consecutive_successes = COALESCE(consecutive_successes, 0) + 1,
        consecutive_failures = 0
      WHERE id = $1
    `, [dispensaryId]);
    console.log(`[ProductDiscoveryHTTP] Stage checkpoint: hydrating`);

    return {
      success: true,
      payloadId: payloadResult?.id || null,
      productCount: result.products.length,
      sizeBytes: payloadResult?.sizeBytes || 0,
      baselineSaved: !!payloadResult,
      wasabiPath,
      snapshotCount,
      eventCount,
    };
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[ProductDiscoveryHTTP] Error for dispensary ${dispensaryId}:`, errorMessage);

    // Stage checkpoint - track failures
    // After 3+ consecutive failures, stage transitions to 'failing'
    try {
      const failureResult = await pool.query(`
        UPDATE dispensaries
        SET
          consecutive_failures = COALESCE(consecutive_failures, 0) + 1,
          consecutive_successes = 0,
          stage = CASE
            WHEN COALESCE(consecutive_failures, 0) + 1 >= 3 THEN 'failing'
            ELSE stage
          END
        WHERE id = $1
        RETURNING consecutive_failures, stage
      `, [dispensaryId]);

      if (failureResult.rows[0]) {
        const { consecutive_failures, stage } = failureResult.rows[0];
        console.log(`[ProductDiscoveryHTTP] Failure tracked: ${consecutive_failures} consecutive failures, stage: ${stage}`);
      }
    } catch (trackError) {
      // Don't let tracking errors mask the original error
      console.error(`[ProductDiscoveryHTTP] Failed to track failure:`, trackError);
    }

    return {
      success: false,
      error: errorMessage,
    };
  } finally {
    // Only non-null when the crawl aborted before the explicit close above.
    if (browser) {
      await browser.close().catch(() => {});
    }
  }
}