Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
260 lines
12 KiB
TypeScript
260 lines
12 KiB
TypeScript
#!/usr/bin/env npx tsx
|
|
/**
|
|
* Crawl Single Store - Verbose test showing each step
|
|
*
|
|
* Usage:
|
|
* DATABASE_URL="postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
|
|
* npx tsx src/scripts/crawl-single-store.ts <dispensaryId>
|
|
*
|
|
* Example:
|
|
* DATABASE_URL="..." npx tsx src/scripts/crawl-single-store.ts 112
|
|
*/
|
|
|
|
import { Pool } from 'pg';
|
|
import dotenv from 'dotenv';
|
|
import {
|
|
executeGraphQL,
|
|
startSession,
|
|
endSession,
|
|
setCrawlRotator,
|
|
GRAPHQL_HASHES,
|
|
DUTCHIE_CONFIG,
|
|
} from '../platforms/dutchie';
|
|
import { CrawlRotator } from '../services/crawl-rotator';
|
|
|
|
// Populate process.env from a .env file before any DATABASE_URL / CANNAIQ_DB_* lookup happens.
dotenv.config();
|
|
|
|
// ============================================================
|
|
// DATABASE CONNECTION
|
|
// ============================================================
|
|
|
|
/**
 * Resolves the PostgreSQL connection string for this script.
 *
 * Resolution order:
 *   1. `DATABASE_URL`, if set and non-empty
 *   2. `CANNAIQ_DB_URL`, if set and non-empty
 *   3. A URL assembled from `CANNAIQ_DB_HOST`, `CANNAIQ_DB_PORT`, `CANNAIQ_DB_NAME`,
 *      `CANNAIQ_DB_USER` and `CANNAIQ_DB_PASS`, each falling back to a local default.
 *
 * @returns A `postgresql://` connection URI.
 */
function getConnectionString(): string {
  const env = process.env;

  // A fully-formed URL always wins over the individual parts.
  const explicitUrl = env.DATABASE_URL || env.CANNAIQ_DB_URL;
  if (explicitUrl) {
    return explicitUrl;
  }

  // `||` (not `??`) on purpose: an empty-string variable falls back to the default.
  const host = env.CANNAIQ_DB_HOST || 'localhost';
  const port = env.CANNAIQ_DB_PORT || '54320';
  const name = env.CANNAIQ_DB_NAME || 'dutchie_menus';
  const user = env.CANNAIQ_DB_USER || 'dutchie';
  const pass = env.CANNAIQ_DB_PASS || 'dutchie_local_pass';

  // Credentials are percent-encoded so characters such as '@', ':', '/' or '#'
  // in a user name or password cannot corrupt the URI structure.
  const credentials = `${encodeURIComponent(user)}:${encodeURIComponent(pass)}`;
  return `postgresql://${credentials}@${host}:${port}/${name}`;
}
|
|
|
|
// Connection pool used by main(); main() calls pool.end() in its finally block.
const connectionString = getConnectionString();
const pool = new Pool({ connectionString });
|
|
|
|
// ============================================================
|
|
// MAIN
|
|
// ============================================================
|
|
|
|
/** Border rows shared by every step banner. */
const STEP_BANNER_TOP = '┌─────────────────────────────────────────────────────────────┐';
const STEP_BANNER_BOTTOM = '└─────────────────────────────────────────────────────────────┘';

/** Prints a three-line step banner: top border, the given title row, bottom border. */
function printStepBanner(titleRow: string): void {
  console.log(STEP_BANNER_TOP);
  console.log(titleRow);
  console.log(STEP_BANNER_BOTTOM);
}

/**
 * Crawls the first product page of a single dispensary and logs every step.
 *
 * Steps:
 *   1. Load the dispensary row (by the id given as the first CLI argument).
 *   2. Install a CrawlRotator and start a session for the dispensary's menu URL.
 *   3. Run the `FilteredProducts` GraphQL operation for page 0.
 *   4. Print totals and the first five products.
 *   5. End the session and print a summary.
 *
 * Exit behaviour:
 *   - Missing / non-numeric / zero dispensary id: prints usage and exits with code 1.
 *   - Any error thrown during the steps: prints the message and the first stack lines,
 *     sets `process.exitCode = 1`, then still ends the session (if one was started)
 *     and closes the pool.
 *   - GraphQL response without `data.filteredProducts`: prints the raw result and
 *     returns without changing the exit code.
 */
async function main(): Promise<void> {
  const dispensaryId = parseInt(process.argv[2], 10);

  // NaN (missing or non-numeric argument) and 0 are both rejected here.
  if (!dispensaryId) {
    console.error('Usage: npx tsx src/scripts/crawl-single-store.ts <dispensaryId>');
    console.error('Example: npx tsx src/scripts/crawl-single-store.ts 112');
    process.exit(1);
  }

  console.log('');
  console.log('╔════════════════════════════════════════════════════════════╗');
  console.log('║ SINGLE STORE CRAWL - VERBOSE OUTPUT ║');
  console.log('╚════════════════════════════════════════════════════════════╝');
  console.log('');

  // Tracks whether startSession() has been called without a matching endSession(),
  // so the finally block can end the session on error paths as well.
  let sessionOpen = false;

  try {
    // ============================================================
    // STEP 1: Get dispensary info from database
    // ============================================================
    printStepBanner('│ STEP 1: Load Dispensary Info from Database │');

    const dispResult = await pool.query(
      `
      SELECT
        id,
        name,
        platform_dispensary_id,
        menu_url,
        menu_type,
        city,
        state
      FROM dispensaries
      WHERE id = $1
      `,
      [dispensaryId]
    );

    if (dispResult.rows.length === 0) {
      throw new Error(`Dispensary ${dispensaryId} not found`);
    }

    const disp = dispResult.rows[0];
    console.log(` Dispensary ID: ${disp.id}`);
    console.log(` Name: ${disp.name}`);
    console.log(` City, State: ${disp.city}, ${disp.state}`);
    console.log(` Menu Type: ${disp.menu_type}`);
    console.log(` Platform ID: ${disp.platform_dispensary_id}`);
    console.log(` Menu URL: ${disp.menu_url}`);

    if (!disp.platform_dispensary_id) {
      throw new Error('Dispensary does not have a platform_dispensary_id - cannot crawl');
    }

    // Extract cName from menu_url: the path segment following
    // /embedded-menu/ or /dispensary/. Falls back to 'dispensary' when absent.
    const cNameMatch = disp.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';
    console.log(` cName (derived): ${cName}`);
    console.log('');

    // ============================================================
    // STEP 2: Start stealth session
    // Per workflow-12102025.md: Initialize CrawlRotator and start session with menuUrl
    // ============================================================
    printStepBanner('│ STEP 2: Start Stealth Session │');

    // Per workflow-12102025.md: Initialize CrawlRotator (required for sessions)
    const rotator = new CrawlRotator();
    setCrawlRotator(rotator);

    // Per workflow-12102025.md: startSession takes menuUrl for dynamic Referer
    const session = startSession(disp.menu_url);
    sessionOpen = true;

    const fp = session.fingerprint;
    console.log(` Session ID: ${session.sessionId}`);
    console.log(` Browser: ${fp.browserName} (${fp.deviceCategory})`);
    console.log(` User-Agent: ${fp.userAgent.slice(0, 60)}...`);
    console.log(` Accept-Language: ${fp.acceptLanguage}`);
    console.log(` Referer: ${session.referer}`);
    console.log(` DNT: ${fp.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
    console.log(` TLS: ${fp.httpFingerprint.curlImpersonateBinary}`);
    console.log('');

    // ============================================================
    // STEP 3: Execute GraphQL query
    // ============================================================
    printStepBanner('│ STEP 3: Execute GraphQL Query (FilteredProducts) │');

    const variables = {
      includeEnterpriseSpecials: false,
      productsFilter: {
        dispensaryId: disp.platform_dispensary_id,
        pricingType: 'rec',
        Status: 'Active',
        types: [],
        useCache: true,
        isDefaultSort: true,
        sortBy: 'popularSortIdx',
        sortDirection: 1,
        bypassOnlineThresholds: true,
        isKioskMenu: false,
        removeProductsBelowOptionThresholds: false,
      },
      page: 0,
      perPage: 100,
    };

    console.log(` Endpoint: ${DUTCHIE_CONFIG.graphqlEndpoint}`);
    console.log(` Operation: FilteredProducts`);
    console.log(` Hash: ${GRAPHQL_HASHES.FilteredProducts.slice(0, 20)}...`);
    console.log(` dispensaryId: ${variables.productsFilter.dispensaryId}`);
    console.log(` pricingType: ${variables.productsFilter.pricingType}`);
    console.log(` Status: ${variables.productsFilter.Status}`);
    console.log(` perPage: ${variables.perPage}`);
    console.log('');
    console.log(' Sending request...');

    const startTime = Date.now();
    const result = await executeGraphQL(
      'FilteredProducts',
      variables,
      GRAPHQL_HASHES.FilteredProducts,
      { cName, maxRetries: 3 }
    );
    const elapsed = Date.now() - startTime;

    console.log(` Response time: ${elapsed}ms`);
    console.log('');

    // ============================================================
    // STEP 4: Process response
    // ============================================================
    printStepBanner('│ STEP 4: Process Response │');

    const data = result?.data?.filteredProducts;
    if (!data) {
      console.log(' ERROR: No data returned from GraphQL');
      console.log(' Raw result:', JSON.stringify(result, null, 2).slice(0, 500));
      // The finally block ends the session and closes the pool.
      return;
    }

    const products = data.products || [];
    const totalCount = data.queryInfo?.totalCount || 0;
    // Page count is derived from the page size actually requested above.
    const totalPages = Math.ceil(totalCount / variables.perPage);

    console.log(` Total products: ${totalCount}`);
    console.log(` Products in page: ${products.length}`);
    console.log(` Total pages: ${totalPages}`);
    console.log('');

    // Show first few products
    console.log(' First 5 products:');
    console.log(' ─────────────────────────────────────────────────────────');
    for (let i = 0; i < Math.min(5, products.length); i++) {
      const p = products[i];
      const name = (p.name || 'Unknown').slice(0, 40);
      const brand = (p.brand?.name || 'Unknown').slice(0, 15);
      const price = p.Prices?.[0]?.price || p.medPrice || p.recPrice || 'N/A';
      console.log(` ${i + 1}. ${name.padEnd(42)} | ${brand.padEnd(17)} | $${price}`);
    }
    console.log('');

    // ============================================================
    // STEP 5: End session
    // ============================================================
    printStepBanner('│ STEP 5: End Session │');

    endSession();
    sessionOpen = false;
    console.log('');

    // ============================================================
    // SUMMARY
    // ============================================================
    console.log('╔════════════════════════════════════════════════════════════╗');
    console.log('║ SUMMARY ║');
    console.log('╠════════════════════════════════════════════════════════════╣');
    console.log(`║ Store: ${disp.name.slice(0, 38).padEnd(38)} ║`);
    console.log(`║ Products Found: ${String(totalCount).padEnd(38)} ║`);
    console.log(`║ Response Time: ${(elapsed + 'ms').padEnd(38)} ║`);
    console.log(`║ Status: ${'SUCCESS'.padEnd(38)} ║`);
    console.log('╚════════════════════════════════════════════════════════════╝');
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances carry message/stack.
    const message = error instanceof Error ? error.message : String(error);
    const stack = error instanceof Error ? error.stack : undefined;

    console.error('');
    console.error('╔════════════════════════════════════════════════════════════╗');
    console.error('║ ERROR ║');
    console.error('╚════════════════════════════════════════════════════════════╝');
    console.error(` ${message}`);
    if (stack) {
      console.error('');
      console.error('Stack trace:');
      console.error(stack.split('\n').slice(0, 5).join('\n'));
    }

    // Set the exit code instead of calling process.exit(1): process.exit() terminates
    // synchronously, which would skip the cleanup in the finally block below.
    process.exitCode = 1;
  } finally {
    try {
      if (sessionOpen) {
        endSession();
      }
    } finally {
      // Close the pool even if ending the session throws.
      await pool.end();
    }
  }
}
|
|
|
|
// Script entry point. The returned promise is intentionally not awaited.
void main();
|