feat: Stealth worker system with mandatory proxy rotation

## Worker System
- Role-agnostic workers that can handle any task type
- Pod-based architecture with StatefulSet (5-15 pods, 5 workers each)
- Custom pod names (Aethelgard, Xylos, Kryll, etc.)
- Worker registry with friendly names and resource monitoring
- Hub-and-spoke visualization on JobQueue page

## Stealth & Anti-Detection (REQUIRED)
- Proxies are MANDATORY - workers fail to start without active proxies
- CrawlRotator initializes on worker startup
- Loads proxies from `proxies` table
- Auto-rotates proxy + fingerprint on 403 errors
- 12 browser fingerprints (Chrome, Firefox, Safari, Edge)
- Locale/timezone matching for geographic consistency

## Task System
- Renamed product_resync → product_refresh
- Task chaining: store_discovery → entry_point → product_discovery
- Priority-based claiming with FOR UPDATE SKIP LOCKED
- Heartbeat and stale task recovery

## UI Updates
- JobQueue: Pod visualization, resource monitoring on hover
- WorkersDashboard: Simplified worker list
- Removed unused filters from task list

## Other
- IP2Location service for visitor analytics
- Findagram consumer features scaffolding
- Documentation updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-10 00:44:59 -07:00
parent 0295637ed6
commit 56cc171287
61 changed files with 8591 additions and 2076 deletions

View File

@@ -0,0 +1,344 @@
/**
* Product Refresh Handler
*
* Re-crawls a store to capture price/stock changes using the GraphQL pipeline.
*
* Flow:
* 1. Load dispensary info from database
* 2. Start stealth session (fingerprint + optional proxy)
* 3. Fetch products via GraphQL (Status: 'All')
* 4. Normalize data via DutchieNormalizer
* 5. Upsert to store_products and store_product_snapshots
* 6. Track missing products (increment consecutive_misses, mark OOS at 3)
* 7. Download new product images
* 8. End session
*/
import { TaskContext, TaskResult } from '../task-worker';
import {
executeGraphQL,
startSession,
endSession,
GRAPHQL_HASHES,
DUTCHIE_CONFIG,
} from '../../platforms/dutchie';
import { DutchieNormalizer } from '../../hydration/normalizers/dutchie';
import {
upsertStoreProducts,
createStoreProductSnapshots,
downloadProductImages,
} from '../../hydration/canonical-upsert';
const normalizer = new DutchieNormalizer();
export async function handleProductRefresh(ctx: TaskContext): Promise<TaskResult> {
const { pool, task } = ctx;
const dispensaryId = task.dispensary_id;
if (!dispensaryId) {
return { success: false, error: 'No dispensary_id specified for product_refresh task' };
}
try {
// ============================================================
// STEP 1: Load dispensary info
// ============================================================
const dispResult = await pool.query(`
SELECT
id, name, platform_dispensary_id, menu_url, menu_type, city, state
FROM dispensaries
WHERE id = $1 AND crawl_enabled = true
`, [dispensaryId]);
if (dispResult.rows.length === 0) {
return { success: false, error: `Dispensary ${dispensaryId} not found or not crawl_enabled` };
}
const dispensary = dispResult.rows[0];
const platformId = dispensary.platform_dispensary_id;
if (!platformId) {
return { success: false, error: `Dispensary ${dispensaryId} has no platform_dispensary_id` };
}
// Extract cName from menu_url
const cNameMatch = dispensary.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
const cName = cNameMatch ? cNameMatch[1] : 'dispensary';
console.log(`[ProductResync] Starting crawl for ${dispensary.name} (ID: ${dispensaryId})`);
console.log(`[ProductResync] Platform ID: ${platformId}, cName: ${cName}`);
// ============================================================
// STEP 2: Start stealth session
// ============================================================
const session = startSession(dispensary.state || 'AZ', 'America/Phoenix');
console.log(`[ProductResync] Session started: ${session.sessionId}`);
await ctx.heartbeat();
// ============================================================
// STEP 3: Fetch products via GraphQL (Status: 'All')
// ============================================================
const allProducts: any[] = [];
let page = 0;
let totalCount = 0;
const perPage = DUTCHIE_CONFIG.perPage;
const maxPages = DUTCHIE_CONFIG.maxPages;
try {
while (page < maxPages) {
const variables = {
includeEnterpriseSpecials: false,
productsFilter: {
dispensaryId: platformId,
pricingType: 'rec',
Status: 'All',
types: [],
useCache: false,
isDefaultSort: true,
sortBy: 'popularSortIdx',
sortDirection: 1,
bypassOnlineThresholds: true,
isKioskMenu: false,
removeProductsBelowOptionThresholds: false,
},
page,
perPage,
};
console.log(`[ProductResync] Fetching page ${page + 1}...`);
const result = await executeGraphQL(
'FilteredProducts',
variables,
GRAPHQL_HASHES.FilteredProducts,
{ cName, maxRetries: 3 }
);
const data = result?.data?.filteredProducts;
if (!data || !data.products) {
if (page === 0) {
throw new Error('No product data returned from GraphQL');
}
break;
}
const products = data.products;
allProducts.push(...products);
if (page === 0) {
totalCount = data.queryInfo?.totalCount || products.length;
console.log(`[ProductResync] Total products reported: ${totalCount}`);
}
if (allProducts.length >= totalCount || products.length < perPage) {
break;
}
page++;
if (page < maxPages) {
await new Promise(r => setTimeout(r, DUTCHIE_CONFIG.pageDelayMs));
}
if (page % 5 === 0) {
await ctx.heartbeat();
}
}
console.log(`[ProductResync] Fetched ${allProducts.length} products in ${page + 1} pages`);
} finally {
endSession();
}
if (allProducts.length === 0) {
return {
success: false,
error: 'No products returned from GraphQL',
productsProcessed: 0,
};
}
await ctx.heartbeat();
// ============================================================
// STEP 4: Normalize data
// ============================================================
console.log(`[ProductResync] Normalizing ${allProducts.length} products...`);
// Build RawPayload for the normalizer
const rawPayload = {
id: `resync-${dispensaryId}-${Date.now()}`,
dispensary_id: dispensaryId,
crawl_run_id: null,
platform: 'dutchie',
payload_version: 1,
raw_json: { data: { filteredProducts: { products: allProducts } } },
product_count: allProducts.length,
pricing_type: 'dual',
crawl_mode: 'dual_mode',
fetched_at: new Date(),
processed: false,
normalized_at: null,
hydration_error: null,
hydration_attempts: 0,
created_at: new Date(),
};
const normalizationResult = normalizer.normalize(rawPayload);
if (normalizationResult.errors.length > 0) {
console.warn(`[ProductResync] Normalization warnings: ${normalizationResult.errors.map(e => e.message).join(', ')}`);
}
if (normalizationResult.products.length === 0) {
return {
success: false,
error: 'Normalization produced no products',
productsProcessed: 0,
};
}
console.log(`[ProductResync] Normalized ${normalizationResult.products.length} products`);
await ctx.heartbeat();
// ============================================================
// STEP 5: Upsert to canonical tables
// ============================================================
console.log(`[ProductResync] Upserting to store_products...`);
const upsertResult = await upsertStoreProducts(
pool,
normalizationResult.products,
normalizationResult.pricing,
normalizationResult.availability
);
console.log(`[ProductResync] Upserted: ${upsertResult.upserted} (${upsertResult.new} new, ${upsertResult.updated} updated)`);
await ctx.heartbeat();
// Create snapshots
console.log(`[ProductResync] Creating snapshots...`);
const snapshotsResult = await createStoreProductSnapshots(
pool,
dispensaryId,
normalizationResult.products,
normalizationResult.pricing,
normalizationResult.availability,
null // No crawl_run_id in new system
);
console.log(`[ProductResync] Created ${snapshotsResult.created} snapshots`);
await ctx.heartbeat();
// ============================================================
// STEP 6: Track missing products (consecutive_misses logic)
// - Products in feed: reset consecutive_misses to 0
// - Products not in feed: increment consecutive_misses
// - At 3 consecutive misses: mark as OOS
// ============================================================
const currentProductIds = allProducts
.map((p: any) => p._id || p.id)
.filter(Boolean);
// Reset consecutive_misses for products that ARE in the feed
if (currentProductIds.length > 0) {
await pool.query(`
UPDATE store_products
SET consecutive_misses = 0, last_seen_at = NOW()
WHERE dispensary_id = $1
AND provider = 'dutchie'
AND provider_product_id = ANY($2)
`, [dispensaryId, currentProductIds]);
}
// Increment consecutive_misses for products NOT in the feed
const incrementResult = await pool.query(`
UPDATE store_products
SET consecutive_misses = consecutive_misses + 1
WHERE dispensary_id = $1
AND provider = 'dutchie'
AND provider_product_id NOT IN (SELECT unnest($2::text[]))
AND consecutive_misses < 3
RETURNING id
`, [dispensaryId, currentProductIds]);
const incrementedCount = incrementResult.rowCount || 0;
if (incrementedCount > 0) {
console.log(`[ProductResync] Incremented consecutive_misses for ${incrementedCount} products`);
}
// Mark as OOS any products that hit 3 consecutive misses
const oosResult = await pool.query(`
UPDATE store_products
SET stock_status = 'oos', is_in_stock = false
WHERE dispensary_id = $1
AND provider = 'dutchie'
AND consecutive_misses >= 3
AND stock_status != 'oos'
RETURNING id
`, [dispensaryId]);
const markedOosCount = oosResult.rowCount || 0;
if (markedOosCount > 0) {
console.log(`[ProductResync] Marked ${markedOosCount} products as OOS (3+ consecutive misses)`);
}
await ctx.heartbeat();
// ============================================================
// STEP 7: Download images for new products
// ============================================================
if (upsertResult.productsNeedingImages.length > 0) {
console.log(`[ProductResync] Downloading images for ${upsertResult.productsNeedingImages.length} products...`);
try {
const dispensaryContext = {
stateCode: dispensary.state || 'AZ',
storeSlug: cName,
};
await downloadProductImages(
pool,
upsertResult.productsNeedingImages,
dispensaryContext
);
} catch (imgError: any) {
// Image download errors shouldn't fail the whole task
console.warn(`[ProductResync] Image download error (non-fatal): ${imgError.message}`);
}
}
// ============================================================
// STEP 8: Update dispensary last_crawl_at
// ============================================================
await pool.query(`
UPDATE dispensaries
SET last_crawl_at = NOW()
WHERE id = $1
`, [dispensaryId]);
console.log(`[ProductResync] Completed ${dispensary.name}`);
return {
success: true,
productsProcessed: normalizationResult.products.length,
snapshotsCreated: snapshotsResult.created,
newProducts: upsertResult.new,
updatedProducts: upsertResult.updated,
markedOos: markedOosCount,
};
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
console.error(`[ProductResync] Error for dispensary ${dispensaryId}:`, errorMessage);
return {
success: false,
error: errorMessage,
};
}
}