From c215d11a84ee718c105ff03458158ea54406623d Mon Sep 17 00:00:00 2001 From: Kelly Date: Sat, 13 Dec 2025 15:16:48 -0700 Subject: [PATCH] feat: Platform isolation, Evomi geo-targeting, proxy management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Platform isolation: - Rename handlers to {task}-{platform}.ts convention - Deprecate -curl variants (now _deprecated-*) - Platform-based routing in task-worker.ts - Add Jane platform handlers and client Evomi geo-targeting: - Add dynamic proxy URL builder with state/city targeting - Session stickiness per worker per state (30 min) - Fallback to static proxy table when API unavailable - Add proxy tracking columns to worker_tasks Proxy management: - New /proxies admin page for visibility - Track proxy_ip, proxy_geo, proxy_source per task - Show active sessions and task history Validation filtering: - Filter by validated stores (platform_dispensary_id + menu_url) - Mark incomplete stores as deprecated - Update all dashboard/stats queries 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/migrations/107_proxy_tracking.sql | 23 + backend/src/discovery/promotion.ts | 3 + backend/src/hydration/normalizers/jane.ts | 250 ++++++++ backend/src/index.ts | 5 + backend/src/platforms/jane/client.ts | 545 ++++++++++++++++++ backend/src/platforms/jane/index.ts | 48 ++ backend/src/platforms/jane/queries.ts | 292 ++++++++++ backend/src/routes/analytics.ts | 5 +- backend/src/routes/dispensaries.ts | 18 +- backend/src/routes/markets.ts | 4 +- backend/src/routes/orchestrator-admin.ts | 7 +- backend/src/routes/pipeline.ts | 13 +- backend/src/routes/proxy-admin.ts | 173 ++++++ backend/src/services/crawl-rotator.ts | 162 ++++++ backend/src/services/task-scheduler.ts | 67 ++- ...l.ts => _deprecated-payload-fetch-curl.ts} | 8 + ... => _deprecated-product-discovery-curl.ts} | 10 +- ...ts => _deprecated-store-discovery-curl.ts} | 8 + .../handlers/entry-point-discovery-jane.ts | 162 ++++++ backend/src/tasks/handlers/index.ts | 25 +- ...y-http.ts => product-discovery-dutchie.ts} | 8 +- .../tasks/handlers/product-discovery-jane.ts | 168 ++++++ backend/src/tasks/handlers/product-refresh.ts | 42 +- ...ery-http.ts => store-discovery-dutchie.ts} | 7 +- .../tasks/handlers/store-discovery-jane.ts | 166 ++++++ backend/src/tasks/index.ts | 11 +- backend/src/tasks/task-worker.ts | 89 +-- backend/src/utils/payload-storage.ts | 33 +- cannaiq/src/App.tsx | 3 + cannaiq/src/pages/ProxyManagement.tsx | 436 ++++++++++++++ cannaiq/src/pages/WorkersDashboard.tsx | 36 +- 31 files changed, 2698 insertions(+), 129 deletions(-) create mode 100644 backend/migrations/107_proxy_tracking.sql create mode 100644 backend/src/hydration/normalizers/jane.ts create mode 100644 backend/src/platforms/jane/client.ts create mode 100644 backend/src/platforms/jane/index.ts create mode 100644 backend/src/platforms/jane/queries.ts create mode 100644 backend/src/routes/proxy-admin.ts rename backend/src/tasks/handlers/{payload-fetch-curl.ts => _deprecated-payload-fetch-curl.ts} (96%) rename backend/src/tasks/handlers/{product-discovery-curl.ts => _deprecated-product-discovery-curl.ts} (76%) rename backend/src/tasks/handlers/{store-discovery.ts => _deprecated-store-discovery-curl.ts} (92%) create mode 100644 backend/src/tasks/handlers/entry-point-discovery-jane.ts rename backend/src/tasks/handlers/{product-discovery-http.ts => product-discovery-dutchie.ts} (98%) create mode 100644 backend/src/tasks/handlers/product-discovery-jane.ts rename backend/src/tasks/handlers/{store-discovery-http.ts => store-discovery-dutchie.ts} (99%) create mode 100644 backend/src/tasks/handlers/store-discovery-jane.ts create mode 100644 cannaiq/src/pages/ProxyManagement.tsx diff --git a/backend/migrations/107_proxy_tracking.sql b/backend/migrations/107_proxy_tracking.sql new file mode 100644 index 00000000..656bd5f4 --- /dev/null +++ b/backend/migrations/107_proxy_tracking.sql @@ -0,0 +1,23 @@ +-- Migration: 107_proxy_tracking.sql +-- Description: Add proxy tracking columns to worker_tasks for geo-targeting visibility +-- Created: 2025-12-13 + +-- Add proxy tracking columns to worker_tasks +ALTER TABLE worker_tasks +ADD COLUMN IF NOT EXISTS proxy_ip VARCHAR(45); + +ALTER TABLE worker_tasks +ADD COLUMN IF NOT EXISTS proxy_geo VARCHAR(100); + +ALTER TABLE worker_tasks +ADD COLUMN IF NOT EXISTS proxy_source VARCHAR(10); + +-- Comments +COMMENT ON COLUMN worker_tasks.proxy_ip IS 'IP address of proxy used for this task'; +COMMENT ON COLUMN worker_tasks.proxy_geo IS 'Geo target used (e.g., "arizona", "phoenix, arizona")'; +COMMENT ON COLUMN worker_tasks.proxy_source IS 'Source of proxy: "api" (Evomi dynamic) or "static" (fallback table)'; + +-- Index for proxy analysis +CREATE INDEX IF NOT EXISTS idx_worker_tasks_proxy_ip + ON worker_tasks(proxy_ip) + WHERE proxy_ip IS NOT NULL; diff --git a/backend/src/discovery/promotion.ts b/backend/src/discovery/promotion.ts index ed3cb635..5bfedbe7 100644 --- a/backend/src/discovery/promotion.ts +++ b/backend/src/discovery/promotion.ts @@ -388,6 +388,9 @@ async function promoteLocation( country = EXCLUDED.country, status = EXCLUDED.status, dutchie_discovery_id = EXCLUDED.dutchie_discovery_id, + dutchie_verified = TRUE, + dutchie_verified_at = COALESCE(dispensaries.dutchie_verified_at, CURRENT_TIMESTAMP), + crawl_enabled = COALESCE(dispensaries.crawl_enabled, TRUE), last_modified_at = EXCLUDED.last_modified_at, last_modified_by_task = EXCLUDED.last_modified_by_task, last_modified_task_id = EXCLUDED.last_modified_task_id, diff --git a/backend/src/hydration/normalizers/jane.ts b/backend/src/hydration/normalizers/jane.ts new file mode 100644 index 00000000..4c18cd38 --- /dev/null +++ b/backend/src/hydration/normalizers/jane.ts @@ -0,0 +1,250 @@ +/** + * iHeartJane Platform Normalizer + * + * Normalizes raw Jane/Algolia product responses to canonical format. + * + * Jane uses Algolia for product search. Key differences from Dutchie: + * - Product ID is numeric (not MongoDB ObjectId) + * - Prices are per-weight (price_gram, price_eighth_ounce, etc.) + * - Category = strain type (hybrid, indica, sativa) + * - Kind = product type (vape, flower, edible, etc.) + */ + +import { BaseNormalizer } from './base'; +import { + NormalizedProduct, + NormalizedPricing, + NormalizedAvailability, + NormalizedBrand, + NormalizedCategory, +} from '../types'; + +export class JaneNormalizer extends BaseNormalizer { + readonly platform = 'jane'; + readonly supportedVersions = [1]; + + // ============================================================ + // EXTRACTION + // ============================================================ + + extractProducts(rawJson: any): any[] { + // Algolia response format: { hits: [...] } + if (rawJson?.hits && Array.isArray(rawJson.hits)) { + return rawJson.hits; + } + + // Direct array of products + if (Array.isArray(rawJson)) { + return rawJson; + } + + // Products array wrapper + if (rawJson?.products && Array.isArray(rawJson.products)) { + return rawJson.products; + } + + // Try data.hits (nested response) + if (rawJson?.data?.hits && Array.isArray(rawJson.data.hits)) { + return rawJson.data.hits; + } + + console.warn('[JaneNormalizer] Could not extract products from payload'); + return []; + } + + validatePayload(rawJson: any): { valid: boolean; errors: string[] } { + const errors: string[] = []; + + if (!rawJson) { + errors.push('Payload is null or undefined'); + return { valid: false, errors }; + } + + const products = this.extractProducts(rawJson); + if (products.length === 0) { + errors.push('No products found in payload'); + } + + // Check for Algolia errors + if (rawJson?.message) { + errors.push(`Algolia error: ${rawJson.message}`); + } + + return { valid: errors.length === 0, errors }; + } + + // ============================================================ + // NORMALIZATION + // ============================================================ + + protected normalizeProduct(rawProduct: any, dispensaryId: number): NormalizedProduct | null { + const externalId = rawProduct.product_id || rawProduct.objectID; + if (!externalId) { + console.warn('[JaneNormalizer] Product missing ID, skipping'); + return null; + } + + const name = rawProduct.name; + if (!name) { + console.warn(`[JaneNormalizer] Product ${externalId} missing name, skipping`); + return null; + } + + return { + externalProductId: String(externalId), + dispensaryId, + platform: 'jane', + platformDispensaryId: '', // Will be set by handler + + // Core fields + name, + brandName: rawProduct.brand || null, + brandId: rawProduct.product_brand_id ? String(rawProduct.product_brand_id) : null, + category: rawProduct.kind || null, // Jane's "kind" = product type (vape, flower, etc.) + subcategory: rawProduct.kind_subtype || rawProduct.root_subtype || null, + type: rawProduct.kind || null, + strainType: rawProduct.category || null, // Jane's "category" = strain (hybrid, indica, sativa) + + // Potency + thcPercent: rawProduct.percent_thc ?? null, + cbdPercent: rawProduct.percent_cbd ?? null, + thcContent: rawProduct.percent_thc ?? null, + cbdContent: rawProduct.percent_cbd ?? null, + + // Status - Jane products in search are always active + status: 'Active', + isActive: true, + medicalOnly: rawProduct.store_types?.includes('medical') && !rawProduct.store_types?.includes('recreational'), + recOnly: rawProduct.store_types?.includes('recreational') && !rawProduct.store_types?.includes('medical'), + + // Images + primaryImageUrl: rawProduct.image_urls?.[0] || null, + images: (rawProduct.image_urls || []).map((url: string, i: number) => ({ + url, + position: i, + })), + + // Raw reference + rawProduct, + }; + } + + protected normalizePricing(rawProduct: any): NormalizedPricing | null { + const externalId = rawProduct.product_id || rawProduct.objectID; + if (!externalId) return null; + + // Jane has multiple price fields by weight + const prices: number[] = []; + const specialPrices: number[] = []; + + // Collect all regular prices + if (rawProduct.price_gram) prices.push(rawProduct.price_gram); + if (rawProduct.price_each) prices.push(rawProduct.price_each); + if (rawProduct.price_half_gram) prices.push(rawProduct.price_half_gram); + if (rawProduct.price_eighth_ounce) prices.push(rawProduct.price_eighth_ounce); + if (rawProduct.price_quarter_ounce) prices.push(rawProduct.price_quarter_ounce); + if (rawProduct.price_half_ounce) prices.push(rawProduct.price_half_ounce); + if (rawProduct.price_ounce) prices.push(rawProduct.price_ounce); + if (rawProduct.price_two_gram) prices.push(rawProduct.price_two_gram); + + // Collect special/discounted prices + if (rawProduct.special_price_gram) specialPrices.push(rawProduct.special_price_gram); + if (rawProduct.special_price_each) specialPrices.push(rawProduct.special_price_each); + if (rawProduct.discounted_price_gram) specialPrices.push(rawProduct.discounted_price_gram); + if (rawProduct.discounted_price_each) specialPrices.push(rawProduct.discounted_price_each); + + // Also check bucket_price and sort_price + if (rawProduct.bucket_price && !prices.includes(rawProduct.bucket_price)) { + prices.push(rawProduct.bucket_price); + } + + // Determine if on special + const isOnSpecial = specialPrices.length > 0 || rawProduct.has_brand_discount === true; + + // Calculate discount percent + let discountPercent: number | null = null; + if (isOnSpecial && prices.length > 0 && specialPrices.length > 0) { + const regularMin = Math.min(...prices); + const specialMin = Math.min(...specialPrices); + if (regularMin > 0 && specialMin < regularMin) { + discountPercent = Math.round(((regularMin - specialMin) / regularMin) * 100); + } + } + + // Get special name from brand_special_prices + let specialName: string | null = null; + if (rawProduct.brand_special_prices) { + const firstSpecial = Object.values(rawProduct.brand_special_prices)[0] as any; + if (firstSpecial?.title) { + specialName = firstSpecial.title; + } + } + + return { + externalProductId: String(externalId), + + // Use minimum price for display + priceRec: this.toCents(this.getMin(prices)), + priceRecMin: this.toCents(this.getMin(prices)), + priceRecMax: this.toCents(this.getMax(prices)), + priceRecSpecial: this.toCents(this.getMin(specialPrices)), + + // Jane doesn't distinguish med pricing in Algolia response + priceMed: null, + priceMedMin: null, + priceMedMax: null, + priceMedSpecial: null, + + isOnSpecial, + specialName, + discountPercent, + }; + } + + protected normalizeAvailability(rawProduct: any): NormalizedAvailability | null { + const externalId = rawProduct.product_id || rawProduct.objectID; + if (!externalId) return null; + + // Jane products in Algolia are in-stock (OOS products aren't returned) + const availableForPickup = rawProduct.available_for_pickup ?? true; + const availableForDelivery = rawProduct.available_for_delivery ?? false; + const inStock = availableForPickup || availableForDelivery; + + // Jane doesn't expose quantity in Algolia + const quantity = rawProduct.max_cart_quantity || null; + + return { + externalProductId: String(externalId), + inStock, + stockStatus: inStock ? 'in_stock' : 'out_of_stock', + quantity, + quantityAvailable: quantity, + isBelowThreshold: false, // Jane doesn't expose this + optionsBelowThreshold: false, + }; + } + + protected extractBrand(rawProduct: any): NormalizedBrand | null { + const brandName = rawProduct.brand; + if (!brandName) return null; + + return { + externalBrandId: rawProduct.product_brand_id ? String(rawProduct.product_brand_id) : null, + name: brandName, + slug: this.slugify(brandName), + logoUrl: rawProduct.brand_logo_url || null, + }; + } + + protected extractCategory(rawProduct: any): NormalizedCategory | null { + // Use "kind" as the primary category (vape, flower, edible, etc.) + const categoryName = rawProduct.kind; + if (!categoryName) return null; + + return { + name: categoryName, + slug: this.slugify(categoryName), + parentCategory: null, + }; + } +} diff --git a/backend/src/index.ts b/backend/src/index.ts index 366d29f1..9a238810 100755 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -90,6 +90,7 @@ import publicApiRoutes from './routes/public-api'; import usersRoutes from './routes/users'; import staleProcessesRoutes from './routes/stale-processes'; import orchestratorAdminRoutes from './routes/orchestrator-admin'; +import proxyAdminRoutes from './routes/proxy-admin'; import adminDebugRoutes from './routes/admin-debug'; import intelligenceRoutes from './routes/intelligence'; import marketsRoutes from './routes/markets'; @@ -172,6 +173,10 @@ app.use('/api/stale-processes', staleProcessesRoutes); // Admin routes - orchestrator actions app.use('/api/admin/orchestrator', orchestratorAdminRoutes); +// Admin routes - proxy management (geo-targeting, sessions, fallback) +app.use('/api/admin/proxies', proxyAdminRoutes); +console.log('[ProxyAdmin] Routes registered at /api/admin/proxies'); + // Admin routes - debug endpoints (snapshot inspection) app.use('/api/admin/debug', adminDebugRoutes); console.log('[AdminDebug] Routes registered at /api/admin/debug'); diff --git a/backend/src/platforms/jane/client.ts b/backend/src/platforms/jane/client.ts new file mode 100644 index 00000000..36bc4497 --- /dev/null +++ b/backend/src/platforms/jane/client.ts @@ -0,0 +1,545 @@ +/** + * ============================================================ + * iHEARTJANE PLATFORM CLIENT + * ============================================================ + * + * Jane uses Cloudflare protection, so all requests must go through + * Puppeteer with stealth plugin. This client manages browser sessions + * and network interception to capture API responses. + * + * API Endpoints: + * - Store: GET https://api.iheartjane.com/v1/stores/{store_id} + * - Products: POST https://search.iheartjane.com/1/indexes/menu-products-production/query (Algolia) + * + * Store ID Format: Numeric (e.g., "2788") + * + * ============================================================ + */ + +import puppeteer, { Browser, Page } from 'puppeteer'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; +import puppeteerExtra from 'puppeteer-extra'; + +import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator'; + +// Register stealth plugin +puppeteerExtra.use(StealthPlugin()); + +// ============================================================ +// TYPES +// ============================================================ + +export interface JaneStoreData { + id: number; + name: string; + address: string; + city: string; + state: string; + zip: string; + lat: number; + long: number; + phone: string; + medical: boolean; + recreational: boolean; + product_count: number; + rating: number; + reviews_count: number; + working_hours: Array<{ day: string; period: { from: string; to: string } }>; + url_slug: string; + photo?: string; + // Full raw data preserved + raw: any; +} + +export interface JaneProductHit { + product_id: number; + name: string; + brand: string; + kind: string; + category: string; + percent_thc: number | null; + percent_cbd: number | null; + price_gram: number | null; + price_each: number | null; + price_eighth_ounce: number | null; + price_quarter_ounce: number | null; + price_half_ounce: number | null; + price_ounce: number | null; + image_urls: string[]; + aggregate_rating: number | null; + review_count: number | null; + available_for_pickup: boolean; + available_for_delivery: boolean; + // Full raw data preserved + raw: any; +} + +export interface JaneSession { + sessionId: string; + browser: Browser; + page: Page; + fingerprint: BrowserFingerprint; + proxyUrl: string | null; + startedAt: Date; + storeId?: string; +} + +export interface CapturedResponse { + type: 'store' | 'product' | 'algolia' | 'api'; + url: string; + data: any; +} + +// ============================================================ +// CONFIGURATION +// ============================================================ + +export const JANE_CONFIG = { + storeApiBase: 'https://api.iheartjane.com/v1', + algoliaEndpoint: 'https://search.iheartjane.com/1/indexes/menu-products-production/query', + timeout: 60000, + maxRetries: 3, + navigationTimeout: 60000, + productFetchDelay: 1000, +}; + +// ============================================================ +// SESSION MANAGEMENT +// ============================================================ + +let currentSession: JaneSession | null = null; +let crawlRotator: CrawlRotator | null = null; + +/** + * Set CrawlRotator for proxy/fingerprint management + */ +export function setCrawlRotator(rotator: CrawlRotator | null): void { + crawlRotator = rotator; + if (rotator) { + console.log('[Jane Client] CrawlRotator attached'); + } +} + +/** + * Get attached CrawlRotator + */ +export function getCrawlRotator(): CrawlRotator | null { + return crawlRotator; +} + +/** + * Start a new Jane browser session + * Uses Puppeteer + Stealth plugin to bypass Cloudflare + */ +export async function startSession(storeId?: string): Promise { + if (currentSession) { + console.log('[Jane Client] Closing existing session before starting new one'); + await endSession(); + } + + // Get fingerprint from rotator or use defaults + let fingerprint: BrowserFingerprint; + let proxyUrl: string | null = null; + + if (crawlRotator) { + fingerprint = crawlRotator.userAgent.getCurrent(); + const proxy = crawlRotator.proxy.getCurrent(); + if (proxy) { + proxyUrl = crawlRotator.proxy.getProxyUrl(proxy); + } + } else { + // Default fingerprint for local testing + fingerprint = { + userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', + browserName: 'Chrome', + deviceCategory: 'desktop', + platform: 'Windows', + screenWidth: 1920, + screenHeight: 1080, + viewportWidth: 1920, + viewportHeight: 1080, + acceptLanguage: 'en-US,en;q=0.9', + secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', + secChUaPlatform: '"Windows"', + secChUaMobile: '?0', + httpFingerprint: { + browserType: 'Chrome' as const, + headers: {}, + headerOrder: [], + curlImpersonateBinary: 'curl_chrome131', + hasDNT: false, + }, + }; + } + + // Build browser args + const browserArgs = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-blink-features=AutomationControlled', + ]; + + if (proxyUrl) { + // Extract host:port from proxy URL for puppeteer + const proxyMatch = proxyUrl.match(/:\/\/([^@]+@)?([^/]+)/); + if (proxyMatch) { + browserArgs.push(`--proxy-server=${proxyMatch[2]}`); + } + } + + console.log('[Jane Client] Launching browser...'); + const browser = await puppeteerExtra.launch({ + headless: true, + args: browserArgs, + }); + + const page = await browser.newPage(); + + // Set viewport + await page.setViewport({ + width: fingerprint.viewportWidth || 1920, + height: fingerprint.viewportHeight || 1080, + }); + + // Set user agent + await page.setUserAgent(fingerprint.userAgent); + + // Handle proxy authentication if needed + if (proxyUrl) { + const authMatch = proxyUrl.match(/:\/\/([^:]+):([^@]+)@/); + if (authMatch) { + await page.authenticate({ + username: authMatch[1], + password: authMatch[2], + }); + } + } + + const sessionId = `jane_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`; + + currentSession = { + sessionId, + browser, + page, + fingerprint, + proxyUrl, + startedAt: new Date(), + storeId, + }; + + console.log(`[Jane Client] Started session ${sessionId}`); + console.log(`[Jane Client] Browser: ${fingerprint.browserName} (${fingerprint.deviceCategory})`); + if (proxyUrl) { + console.log(`[Jane Client] Proxy: ${proxyUrl.replace(/:[^:@]+@/, ':***@')}`); + } + + return currentSession; +} + +/** + * End the current browser session + */ +export async function endSession(): Promise { + if (currentSession) { + const duration = Math.round((Date.now() - currentSession.startedAt.getTime()) / 1000); + console.log(`[Jane Client] Ending session ${currentSession.sessionId} (${duration}s)`); + + try { + await currentSession.browser.close(); + } catch (e) { + console.warn('[Jane Client] Error closing browser:', e); + } + + currentSession = null; + } +} + +/** + * Get current active session + */ +export function getCurrentSession(): JaneSession | null { + return currentSession; +} + +// ============================================================ +// NETWORK INTERCEPTION +// ============================================================ + +/** + * Navigate to a Jane menu page and capture API responses + * Returns captured store and product data from network interception + */ +export async function navigateAndCapture( + menuUrl: string, + options: { waitForProducts?: boolean; captureStore?: boolean } = {} +): Promise<{ store?: JaneStoreData; products: JaneProductHit[]; responses: CapturedResponse[] }> { + const { waitForProducts = true, captureStore = true } = options; + + if (!currentSession) { + throw new Error('[Jane Client] No active session - call startSession() first'); + } + + const { page } = currentSession; + const capturedResponses: CapturedResponse[] = []; + let storeData: JaneStoreData | undefined; + const products: JaneProductHit[] = []; + + // Enable request interception + await page.setRequestInterception(true); + + // Block heavy resources to speed up page load + page.on('request', (req) => { + const type = req.resourceType(); + if (['image', 'font', 'media', 'stylesheet'].includes(type)) { + req.abort(); + } else { + req.continue(); + } + }); + + // Capture API responses + page.on('response', async (response) => { + const url = response.url(); + const contentType = response.headers()['content-type'] || ''; + + // Only process JSON responses from Jane domains + if ((url.includes('iheartjane.com') || url.includes('algolia')) && contentType.includes('json')) { + try { + const json = await response.json(); + + // Categorize response + if (url.includes('/stores/')) { + capturedResponses.push({ type: 'store', url, data: json }); + + if (captureStore && json.store) { + storeData = parseStoreData(json.store); + console.log(`[Jane Client] Captured store: ${storeData.name} (ID: ${storeData.id})`); + } + } else if (url.includes('algolia') || url.includes('search.iheartjane')) { + capturedResponses.push({ type: 'algolia', url, data: json }); + + if (json.hits && Array.isArray(json.hits)) { + const newProducts = json.hits.map(parseProductHit); + products.push(...newProducts); + console.log(`[Jane Client] Captured ${json.hits.length} products from Algolia`); + } + } else { + capturedResponses.push({ type: 'api', url, data: json }); + } + } catch { + // Not valid JSON, skip + } + } + }); + + console.log(`[Jane Client] Navigating to ${menuUrl}`); + + try { + await page.goto(menuUrl, { + waitUntil: 'networkidle2', + timeout: JANE_CONFIG.navigationTimeout, + }); + + // Wait for products to load + if (waitForProducts) { + console.log('[Jane Client] Waiting for product data...'); + await sleep(3000); + + // Try to wait for product selector + try { + await page.waitForSelector('[data-testid="product-card"], .product-card, [class*="ProductCard"]', { + timeout: 10000, + }); + } catch { + console.log('[Jane Client] No product cards found via selector (may have loaded via API)'); + } + } + } catch (error: any) { + console.error(`[Jane Client] Navigation error: ${error.message}`); + throw error; + } + + // Record success if we got data + if (crawlRotator && (storeData || products.length > 0)) { + await crawlRotator.recordSuccess(); + } + + return { store: storeData, products, responses: capturedResponses }; +} + +/** + * Fetch store info directly via browser context + * Uses page.evaluate to make fetch request from browser (bypasses Cloudflare) + */ +export async function fetchStoreInfo(storeId: string | number): Promise { + if (!currentSession) { + throw new Error('[Jane Client] No active session - call startSession() first'); + } + + const { page } = currentSession; + const url = `${JANE_CONFIG.storeApiBase}/stores/${storeId}`; + + console.log(`[Jane Client] Fetching store info: ${url}`); + + try { + const result = await page.evaluate(async (apiUrl: string) => { + try { + const resp = await fetch(apiUrl); + if (resp.ok) { + return await resp.json(); + } + return { error: resp.status }; + } catch (e: any) { + return { error: e.message }; + } + }, url); + + if (result.error) { + console.error(`[Jane Client] Store fetch failed: ${result.error}`); + return null; + } + + if (result.store) { + const storeData = parseStoreData(result.store); + console.log(`[Jane Client] Got store: ${storeData.name}`); + return storeData; + } + + return null; + } catch (error: any) { + console.error(`[Jane Client] Error fetching store: ${error.message}`); + return null; + } +} + +/** + * Extract store ID from a Jane menu URL + * Jane URLs don't always contain the store ID directly - we may need to intercept it + */ +export async function extractStoreIdFromUrl(menuUrl: string): Promise { + if (!currentSession) { + throw new Error('[Jane Client] No active session - call startSession() first'); + } + + // Try to extract from URL pattern first (if embedded) + const urlMatch = menuUrl.match(/store[s]?[\/=](\d+)/i); + if (urlMatch) { + return urlMatch[1]; + } + + // Otherwise, navigate and intercept the store API call + console.log('[Jane Client] Store ID not in URL, intercepting from navigation...'); + + const { page } = currentSession; + let capturedStoreId: string | null = null; + + // Listen for store API calls + const storeIdPromise = new Promise((resolve) => { + const timeout = setTimeout(() => resolve(null), 30000); + + page.on('response', async (response) => { + const url = response.url(); + if (url.includes('/v1/stores/') && !capturedStoreId) { + const match = url.match(/\/stores\/(\d+)/); + if (match) { + capturedStoreId = match[1]; + clearTimeout(timeout); + resolve(capturedStoreId); + } + } + }); + }); + + // Enable interception and navigate + await page.setRequestInterception(true); + page.on('request', (req) => { + const type = req.resourceType(); + if (['image', 'font', 'media'].includes(type)) { + req.abort(); + } else { + req.continue(); + } + }); + + await page.goto(menuUrl, { + waitUntil: 'domcontentloaded', + timeout: JANE_CONFIG.navigationTimeout, + }); + + // Wait for store ID to be captured + const storeId = await storeIdPromise; + + if (storeId) { + console.log(`[Jane Client] Extracted store ID: ${storeId}`); + } else { + console.warn('[Jane Client] Could not extract store ID from URL'); + } + + return storeId; +} + +// ============================================================ +// DATA PARSING +// ============================================================ + +/** + * Parse raw store data into normalized structure + */ +function parseStoreData(raw: any): JaneStoreData { + return { + id: raw.id, + name: raw.name, + address: raw.address || '', + city: raw.city || '', + state: raw.state || '', + zip: raw.zip || '', + lat: raw.lat || 0, + long: raw.long || 0, + phone: raw.phone || '', + medical: raw.medical || false, + recreational: raw.recreational || false, + product_count: raw.product_count || 0, + rating: raw.rating || 0, + reviews_count: raw.reviews_count || 0, + working_hours: raw.working_hours || [], + url_slug: raw.url_slug || '', + photo: raw.photo, + raw, + }; +} + +/** + * Parse raw Algolia product hit into normalized structure + */ +function parseProductHit(hit: any): JaneProductHit { + return { + product_id: hit.product_id, + name: hit.name, + brand: hit.brand, + kind: hit.kind, + category: hit.category, + percent_thc: hit.percent_thc ?? null, + percent_cbd: hit.percent_cbd ?? null, + price_gram: hit.price_gram ?? null, + price_each: hit.price_each ?? null, + price_eighth_ounce: hit.price_eighth_ounce ?? null, + price_quarter_ounce: hit.price_quarter_ounce ?? null, + price_half_ounce: hit.price_half_ounce ?? null, + price_ounce: hit.price_ounce ?? null, + image_urls: hit.image_urls || [], + aggregate_rating: hit.aggregate_rating ?? null, + review_count: hit.review_count ?? null, + available_for_pickup: hit.available_for_pickup ?? false, + available_for_delivery: hit.available_for_delivery ?? false, + raw: hit, + }; +} + +// ============================================================ +// UTILITY +// ============================================================ + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/backend/src/platforms/jane/index.ts b/backend/src/platforms/jane/index.ts new file mode 100644 index 00000000..787ddcc7 --- /dev/null +++ b/backend/src/platforms/jane/index.ts @@ -0,0 +1,48 @@ +/** + * iHeartJane Platform Module + * + * Single export point for all Jane communication. + * All Jane workers MUST import from this module. + */ + +export { + // Session Management + startSession, + endSession, + getCurrentSession, + + // Proxy/Rotation + setCrawlRotator, + getCrawlRotator, + + // Core Operations + navigateAndCapture, + fetchStoreInfo, + extractStoreIdFromUrl, + + // Configuration + JANE_CONFIG, + + // Types + type JaneSession, + type JaneStoreData, + type JaneProductHit, + type CapturedResponse, +} from './client'; + +// High-level Query Functions +export { + resolveStoreFromUrl, + getStoreById, + fetchProductsFromUrl, + fetchProductsByStoreId, + discoverStoresByState, + + // Types + type ResolveStoreResult, + type FetchProductsResult, + type DiscoveredStore, +} from './queries'; + +// Re-export CrawlRotator types from canonical location +export type { CrawlRotator, Proxy, ProxyStats } from '../../services/crawl-rotator'; diff --git a/backend/src/platforms/jane/queries.ts b/backend/src/platforms/jane/queries.ts new file mode 100644 index 00000000..603151da --- /dev/null +++ b/backend/src/platforms/jane/queries.ts @@ -0,0 +1,292 @@ +/** + * iHeartJane High-Level Query Functions + * + * Wraps the low-level client methods with business logic + * for common operations like store lookup and product fetching. + */ + +import { + startSession, + endSession, + navigateAndCapture, + fetchStoreInfo, + extractStoreIdFromUrl, + JaneStoreData, + JaneProductHit, + CapturedResponse, +} from './client'; + +// ============================================================ +// STORE OPERATIONS +// ============================================================ + +export interface ResolveStoreResult { + storeId: string; + store: JaneStoreData; +} + +/** + * Resolve a menu URL to a Jane store ID and fetch store details + * + * @param menuUrl - The dispensary's menu URL (e.g., https://theflowershopusa.com/mesa/menu/) + * @returns Store ID and store data, or null if not found + */ +export async function resolveStoreFromUrl(menuUrl: string): Promise { + try { + await startSession(); + + // Navigate and capture store data + const { store, responses } = await navigateAndCapture(menuUrl, { + waitForProducts: false, + captureStore: true, + }); + + if (store) { + return { + storeId: String(store.id), + store, + }; + } + + // If store wasn't captured from navigation, try to extract ID and fetch directly + const storeId = await extractStoreIdFromUrl(menuUrl); + if (storeId) { + const storeData = await fetchStoreInfo(storeId); + if (storeData) { + return { + storeId, + store: storeData, + }; + } + } + + // Check if we captured any store response + const storeResponse = responses.find((r) => r.type === 'store'); + if (storeResponse?.data?.store) { + const storeData = storeResponse.data.store; + return { + storeId: String(storeData.id), + store: storeData, + }; + } + + return null; + } finally { + await endSession(); + } +} + +/** + * Get store information by store ID + * + * @param storeId - Jane store ID (numeric string) + * @returns Store data or null if not found + */ +export async function getStoreById(storeId: string | number): Promise { + try { + await startSession(String(storeId)); + + // Need to navigate to a Jane page first to establish browser context + // Use the Jane stores page as a base + const { page } = (await import('./client')).getCurrentSession()!; + await page.goto('https://www.iheartjane.com/stores', { + waitUntil: 'domcontentloaded', + timeout: 30000, + }); + + // Now fetch store info + return await fetchStoreInfo(storeId); + } finally { + await endSession(); + } +} + +// ============================================================ +// PRODUCT OPERATIONS +// ============================================================ + +export interface FetchProductsResult { + store?: JaneStoreData; + products: JaneProductHit[]; + totalCaptured: number; + responses: CapturedResponse[]; +} + +/** + * Fetch all products from a Jane menu page + * + * @param menuUrl - The dispensary's menu URL + * @returns Products and store data captured from the page + */ +export async function fetchProductsFromUrl(menuUrl: string): Promise { + try { + await startSession(); + + const { store, products, responses } = await navigateAndCapture(menuUrl, { + waitForProducts: true, + captureStore: true, + }); + + return { + store, + products, + totalCaptured: products.length, + responses, + }; + } finally { + await endSession(); + } +} + +/** + * Fetch products for a known store ID + * Constructs a Jane menu URL and fetches products + * + * @param storeId - Jane store ID + * @param urlSlug - Optional URL slug for the store + * @returns Products and store data + */ +export async function fetchProductsByStoreId( + storeId: string | number, + urlSlug?: string +): Promise { + // Construct a Jane embedded menu URL + // Jane stores can be accessed via: https://www.iheartjane.com/stores/{store_id}/{url_slug} + const slug = urlSlug || 'menu'; + const menuUrl = `https://www.iheartjane.com/stores/${storeId}/${slug}`; + + return fetchProductsFromUrl(menuUrl); +} + +// ============================================================ +// DISCOVERY OPERATIONS +// ============================================================ + +export interface DiscoveredStore { + storeId: string; + name: string; + address: string; + city: string; + state: string; + zip: string; + lat: number; + long: number; + medical: boolean; + recreational: boolean; + productCount: number; + urlSlug: string; +} + +/** + * Discover Jane stores in a state + * Navigates to Jane's store locator and extracts store data + * + * @param stateCode - Two-letter state code (e.g., 'AZ') + * @returns Array of discovered stores + */ +export async function discoverStoresByState(stateCode: string): Promise { + const stores: DiscoveredStore[] = []; + + try { + await startSession(); + + const { page } = (await import('./client')).getCurrentSession()!; + + // Jane has a store directory at /stores + // Try state-specific URL first + const storeListUrl = `https://www.iheartjane.com/stores?state=${stateCode}`; + + console.log(`[Jane Queries] Discovering stores in ${stateCode}: ${storeListUrl}`); + + await page.setRequestInterception(true); + + // Capture store list responses + const storeResponses: any[] = []; + + page.on('request', (req) => { + const type = req.resourceType(); + if (['image', 'font', 'media', 'stylesheet'].includes(type)) { + req.abort(); + } else { + req.continue(); + } + }); + + page.on('response', async (response) => { + const url = response.url(); + const contentType = response.headers()['content-type'] || ''; + + if (url.includes('iheartjane.com') && contentType.includes('json')) { + try { + const json = await response.json(); + if (json.stores && Array.isArray(json.stores)) { + storeResponses.push(...json.stores); + console.log(`[Jane Queries] Captured ${json.stores.length} stores from API`); + } + } catch { + // Not valid JSON + } + } + }); + + await page.goto(storeListUrl, { + waitUntil: 'networkidle2', + timeout: 60000, + }); + + // Wait for stores to load + await new Promise((r) => setTimeout(r, 3000)); + + // Parse captured stores + for (const store of storeResponses) { + // Filter by state + if (store.state?.toLowerCase() === stateCode.toLowerCase() || + store.state?.toLowerCase() === getStateName(stateCode).toLowerCase()) { + stores.push({ + storeId: String(store.id), + name: store.name || '', + address: store.address || '', + city: store.city || '', + state: store.state || stateCode, + zip: store.zip || '', + lat: store.lat || 0, + long: store.long || 0, + medical: store.medical || false, + recreational: store.recreational || false, + productCount: store.product_count || 0, + urlSlug: store.url_slug || '', + }); + } + } + + console.log(`[Jane Queries] Found ${stores.length} stores in ${stateCode}`); + + return stores; + } finally { + await endSession(); + } +} + +// ============================================================ +// UTILITY +// ============================================================ + +/** + * Convert state code to full state name + */ +function getStateName(code: string): string { + const states: Record = { + AL: 'Alabama', AK: 'Alaska', AZ: 'Arizona', AR: 'Arkansas', CA: 'California', + CO: 'Colorado', CT: 'Connecticut', DE: 'Delaware', FL: 'Florida', GA: 'Georgia', + HI: 'Hawaii', ID: 'Idaho', IL: 'Illinois', IN: 'Indiana', IA: 'Iowa', + KS: 'Kansas', KY: 'Kentucky', LA: 'Louisiana', ME: 'Maine', MD: 'Maryland', + MA: 'Massachusetts', MI: 'Michigan', MN: 'Minnesota', MS: 'Mississippi', MO: 'Missouri', + MT: 'Montana', NE: 'Nebraska', NV: 'Nevada', NH: 'New Hampshire', NJ: 'New Jersey', + NM: 'New Mexico', NY: 'New York', NC: 'North Carolina', ND: 'North Dakota', OH: 'Ohio', + OK: 'Oklahoma', OR: 'Oregon', PA: 'Pennsylvania', RI: 'Rhode Island', SC: 'South Carolina', + SD: 'South Dakota', TN: 'Tennessee', TX: 'Texas', UT: 'Utah', VT: 'Vermont', + VA: 'Virginia', WA: 'Washington', WV: 'West Virginia', WI: 'Wisconsin', WY: 'Wyoming', + DC: 'District of Columbia', + }; + return states[code.toUpperCase()] || code; +} diff --git a/backend/src/routes/analytics.ts b/backend/src/routes/analytics.ts index 03fe02ae..6536cc75 100755 --- a/backend/src/routes/analytics.ts +++ b/backend/src/routes/analytics.ts @@ -130,7 +130,7 @@ router.get('/national/summary', async (req, res) => { return res.json(cached); } - // Single optimized query for all state metrics + // Single optimized query for all state metrics (only validated stores) const { rows: stateMetrics } = await pool.query(` SELECT d.state, @@ -146,6 +146,9 @@ router.get('/national/summary', async (req, res) => { LEFT JOIN store_products sp ON d.id = sp.dispensary_id LEFT JOIN states s ON d.state = s.code WHERE d.state IS NOT NULL + AND d.platform_dispensary_id IS NOT NULL + AND d.menu_url IS NOT NULL + AND (d.stage IS NULL OR d.stage != 'deprecated') GROUP BY d.state, s.name ORDER BY store_count DESC `); diff --git a/backend/src/routes/dispensaries.ts b/backend/src/routes/dispensaries.ts index 61f5a771..fa2a5d62 100644 --- a/backend/src/routes/dispensaries.ts +++ b/backend/src/routes/dispensaries.ts @@ -53,6 +53,7 @@ router.get('/', async (req, res) => { last_crawl_at, crawl_enabled, dutchie_verified, + stage, created_at, updated_at FROM dispensaries @@ -79,15 +80,17 @@ router.get('/', async (req, res) => { params.push(state); } - // Filter by crawl_enabled - defaults to showing only enabled + // Filter by crawl_enabled if (crawl_enabled === 'false' || crawl_enabled === '0') { // Explicitly show disabled only conditions.push(`(crawl_enabled = false OR crawl_enabled IS NULL)`); } else if (crawl_enabled === 'all') { - // Show all (no filter) + // Show all including deprecated (no validation filter) } else { - // Default: show only enabled - conditions.push(`crawl_enabled = true`); + // Default: show only validated stores (has location data, not deprecated) + conditions.push(`platform_dispensary_id IS NOT NULL`); + conditions.push(`menu_url IS NOT NULL`); + conditions.push(`(stage IS NULL OR stage != 'deprecated')`); } // Filter by dutchie_verified if provided @@ -140,12 +143,15 @@ router.get('/', async (req, res) => { } }); -// Get menu type stats +// Get menu type stats (only validated stores) router.get('/stats/menu-types', async (req, res) => { try { const result = await pool.query(` SELECT menu_type, COUNT(*) as count FROM dispensaries + WHERE platform_dispensary_id IS NOT NULL + AND menu_url IS NOT NULL + AND (stage IS NULL OR stage != 'deprecated') GROUP BY menu_type ORDER BY count DESC `); @@ -163,6 +169,8 @@ router.get('/stats/crawl-status', async (req, res) => { let query = ` SELECT + COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL AND menu_url IS NOT NULL AND (stage IS NULL OR stage != 'deprecated')) as validated_count, + COUNT(*) FILTER (WHERE stage = 'deprecated') as deprecated_count, COUNT(*) FILTER (WHERE crawl_enabled = true) as enabled_count, COUNT(*) FILTER (WHERE crawl_enabled = false OR crawl_enabled IS NULL) as disabled_count, COUNT(*) FILTER (WHERE dutchie_verified = true) as verified_count, diff --git a/backend/src/routes/markets.ts b/backend/src/routes/markets.ts index 7b59ce79..b5ca1faa 100644 --- a/backend/src/routes/markets.ts +++ b/backend/src/routes/markets.ts @@ -26,10 +26,10 @@ router.get('/dashboard', async (req: Request, res: Response) => { return res.json(dashboardCache.data); } - // Single optimized query for all counts + // Single optimized query for all counts (only validated stores) const { rows } = await pool.query(` SELECT - (SELECT COUNT(*) FROM dispensaries) as dispensary_count, + (SELECT COUNT(*) FROM dispensaries WHERE platform_dispensary_id IS NOT NULL AND menu_url IS NOT NULL AND (stage IS NULL OR stage != 'deprecated')) as dispensary_count, (SELECT n_live_tup FROM pg_stat_user_tables WHERE relname = 'store_products') as product_count, (SELECT COUNT(*) FROM (SELECT DISTINCT brand_name_raw FROM store_products WHERE brand_name_raw IS NOT NULL LIMIT 10000) b) as brand_count, (SELECT COUNT(*) FROM (SELECT DISTINCT category_raw FROM store_products WHERE category_raw IS NOT NULL LIMIT 1000) c) as category_count, diff --git a/backend/src/routes/orchestrator-admin.ts b/backend/src/routes/orchestrator-admin.ts index 80679890..9bffd546 100644 --- a/backend/src/routes/orchestrator-admin.ts +++ b/backend/src/routes/orchestrator-admin.ts @@ -78,14 +78,17 @@ router.get('/metrics', async (_req: Request, res: Response) => { /** * GET /api/admin/orchestrator/states - * Returns array of states with at least one crawl-enabled dispensary + * Returns array of states with at least one validated dispensary */ router.get('/states', async (_req: Request, res: Response) => { try { const { rows } = await pool.query(` SELECT DISTINCT state, COUNT(*) as store_count FROM dispensaries - WHERE state IS NOT NULL AND crawl_enabled = true + WHERE state IS NOT NULL + AND platform_dispensary_id IS NOT NULL + AND menu_url IS NOT NULL + AND (stage IS NULL OR stage != 'deprecated') GROUP BY state ORDER BY state `); diff --git a/backend/src/routes/pipeline.ts b/backend/src/routes/pipeline.ts index fd9c105e..7d045259 100644 --- a/backend/src/routes/pipeline.ts +++ b/backend/src/routes/pipeline.ts @@ -982,19 +982,24 @@ router.get('/stats', async (_req: Request, res: Response) => { GROUP BY stage `); - // Dispensaries by stage + // Dispensaries by stage (only validated stores) const { rows: dispensaryStats } = await pool.query(` SELECT stage, COUNT(*) as count FROM dispensaries - WHERE crawl_enabled = true + WHERE platform_dispensary_id IS NOT NULL + AND menu_url IS NOT NULL + AND (stage IS NULL OR stage != 'deprecated') GROUP BY stage `); - // By state for dispensaries + // By state for dispensaries (only validated stores) const { rows: byState } = await pool.query(` SELECT state, stage, COUNT(*) as count FROM dispensaries - WHERE crawl_enabled = true AND state IS NOT NULL + WHERE state IS NOT NULL + AND platform_dispensary_id IS NOT NULL + AND menu_url IS NOT NULL + AND (stage IS NULL OR stage != 'deprecated') GROUP BY state, stage ORDER BY state, stage `); diff --git a/backend/src/routes/proxy-admin.ts b/backend/src/routes/proxy-admin.ts new file mode 100644 index 00000000..6ee9b40c --- /dev/null +++ b/backend/src/routes/proxy-admin.ts @@ -0,0 +1,173 @@ +import { Router, Request, Response } from 'express'; +import { pool } from '../db/pool'; +import { authMiddleware } from '../auth/middleware'; + +const router = Router(); +router.use(authMiddleware); + +/** + * GET /api/admin/proxies + * Get all static proxies (fallback when API unavailable) + */ +router.get('/', async (_req: Request, res: Response) => { + try { + const { rows } = await pool.query(` + SELECT + id, host, port, protocol, username, + city, state, country, country_code, + active, failure_count, consecutive_403_count, + response_time_ms, last_tested_at + FROM proxies + ORDER BY active DESC, failure_count ASC + `); + + res.json({ proxies: rows }); + } catch (error: any) { + console.error('[ProxyAdmin] Error fetching proxies:', error.message); + res.status(500).json({ error: error.message }); + } +}); + +/** + * GET /api/admin/proxies/sessions + * Get active proxy sessions (geo-locked workers) + */ +router.get('/sessions', async (_req: Request, res: Response) => { + try { + // For now, return worker preflight data as "sessions" + // TODO: Implement proper session tracking table + const { rows } = await pool.query(` + SELECT + worker_id, + http_ip as proxy_ip, + 'US' as proxy_geo, + 'AZ' as state, + preflight_http_at as started_at, + preflight_http_at + interval '30 minutes' as expires_at, + tasks_completed as tasks_completed + FROM worker_registry + WHERE status = 'active' + AND http_ip IS NOT NULL + ORDER BY preflight_http_at DESC + `); + + res.json({ sessions: rows }); + } catch (error: any) { + console.error('[ProxyAdmin] Error fetching sessions:', error.message); + res.status(500).json({ error: error.message }); + } +}); + +/** + * GET /api/admin/proxies/tasks + * Get recent tasks with proxy usage info + */ +router.get('/tasks', async (req: Request, res: Response) => { + try { + const limit = parseInt(req.query.limit as string) || 100; + + const { rows } = await pool.query(` + SELECT + t.id, + t.worker_id, + t.role, + t.status, + t.proxy_ip, + t.proxy_geo, + t.proxy_source, + d.name as dispensary_name, + d.city as dispensary_city, + d.state as dispensary_state, + t.created_at + FROM worker_tasks t + LEFT JOIN dispensaries d ON t.dispensary_id = d.id + ORDER BY t.id DESC + LIMIT $1 + `, [limit]); + + res.json({ tasks: rows }); + } catch (error: any) { + console.error('[ProxyAdmin] Error fetching tasks:', error.message); + res.status(500).json({ error: error.message }); + } +}); + +/** + * GET /api/admin/proxies/evomi-settings + * Get Evomi API configuration status + */ +router.get('/evomi-settings', async (_req: Request, res: Response) => { + try { + const apiKeySet = !!process.env.EVOMI_API_KEY; + + // US state codes for region mapping + const usStates = [ + 'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', + 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', + 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', + 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', + 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY' + ]; + + // Major US cities we know are available + const majorCities = [ + 'phoenix', 'tucson', 'los.angeles', 'san.francisco', 'denver', + 'chicago', 'new.york', 'miami', 'seattle', 'portland', + 'las.vegas', 'boston', 'detroit', 'atlanta', 'dallas' + ]; + + res.json({ + enabled: apiKeySet, + api_key_set: apiKeySet, + available_regions: usStates, + available_cities: majorCities, + }); + } catch (error: any) { + console.error('[ProxyAdmin] Error fetching Evomi settings:', error.message); + res.status(500).json({ error: error.message }); + } +}); + +/** + * POST /api/admin/proxies + * Add a static proxy (fallback) + */ +router.post('/', async (req: Request, res: Response) => { + try { + const { host, port, protocol, username, password, city, state, country } = req.body; + + if (!host || !port) { + return res.status(400).json({ error: 'Host and port are required' }); + } + + const { rows } = await pool.query(` + INSERT INTO proxies (host, port, protocol, username, password, city, state, country, active) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, true) + RETURNING * + `, [host, port, protocol || 'http', username, password, city, state, country]); + + res.json({ proxy: rows[0] }); + } catch (error: any) { + console.error('[ProxyAdmin] Error adding proxy:', error.message); + res.status(500).json({ error: error.message }); + } +}); + +/** + * DELETE /api/admin/proxies/:id + * Remove a static proxy + */ +router.delete('/:id', async (req: Request, res: Response) => { + try { + const { id } = req.params; + + await pool.query('DELETE FROM proxies WHERE id = $1', [id]); + + res.json({ success: true }); + } catch (error: any) { + console.error('[ProxyAdmin] Error deleting proxy:', error.message); + res.status(500).json({ error: error.message }); + } +}); + +export default router; diff --git a/backend/src/services/crawl-rotator.ts b/backend/src/services/crawl-rotator.ts index 3165b694..98313462 100644 --- a/backend/src/services/crawl-rotator.ts +++ b/backend/src/services/crawl-rotator.ts @@ -900,6 +900,168 @@ export interface PreflightResult { responseTimeMs: number | null; } +// ============================================================ +// EVOMI GEO-TARGETING +// Dynamic proxy URL generation based on dispensary state +// ============================================================ + +/** + * State code to Evomi region ID mapping + * Format: lowercase, spaces replaced with dots + */ +const STATE_TO_REGION: Record = { + 'AL': 'alabama', 'AK': 'alaska', 'AZ': 'arizona', 'AR': 'arkansas', + 'CA': 'california', 'CO': 'colorado', 'CT': 'connecticut', 'DE': 'delaware', + 'FL': 'florida', 'GA': 'georgia', 'HI': 'hawaii', 'ID': 'idaho', + 'IL': 'illinois', 'IN': 'indiana', 'IA': 'iowa', 'KS': 'kansas', + 'KY': 'kentucky', 'LA': 'louisiana', 'ME': 'maine', 'MD': 'maryland', + 'MA': 'massachusetts', 'MI': 'michigan', 'MN': 'minnesota', 'MS': 'mississippi', + 'MO': 'missouri', 'MT': 'montana', 'NE': 'nebraska', 'NV': 'nevada', + 'NH': 'new.hampshire', 'NJ': 'new.jersey', 'NM': 'new.mexico', 'NY': 'new.york', + 'NC': 'north.carolina', 'ND': 'north.dakota', 'OH': 'ohio', 'OK': 'oklahoma', + 'OR': 'oregon', 'PA': 'pennsylvania', 'RI': 'rhode.island', 'SC': 'south.carolina', + 'SD': 'south.dakota', 'TN': 'tennessee', 'TX': 'texas', 'UT': 'utah', + 'VT': 'vermont', 'VA': 'virginia', 'WA': 'washington', 'WV': 'west.virginia', + 'WI': 'wisconsin', 'WY': 'wyoming', + // Canadian provinces + 'AB': 'alberta', 'BC': 'british.columbia', 'MB': 'manitoba', 'NB': 'new.brunswick', + 'NL': 'newfoundland.and.labrador', 'NS': 'nova.scotia', 'ON': 'ontario', + 'PE': 'prince.edward.island', 'QC': 'quebec', 'SK': 'saskatchewan', +}; + +/** + * Evomi proxy configuration from environment + */ +export interface EvomiConfig { + enabled: boolean; + user: string; + pass: string; + host: string; + port: number; + sessionLifetimeMinutes: number; +} + +/** + * Get Evomi configuration from environment variables + */ +export function getEvomiConfig(): EvomiConfig { + const user = process.env.EVOMI_USER || ''; + const pass = process.env.EVOMI_PASS || ''; + + return { + enabled: !!(user && pass), + user, + pass, + host: process.env.EVOMI_HOST || 'rpc.evomi.com', + port: parseInt(process.env.EVOMI_PORT || '1000', 10), + sessionLifetimeMinutes: parseInt(process.env.EVOMI_SESSION_LIFETIME || '30', 10), + }; +} + +/** + * Build a geo-targeted Evomi proxy URL + * + * @param stateCode - US state code (e.g., 'AZ') + * @param workerId - Worker ID for session stickiness + * @param city - Optional city name (lowercase, dots for spaces) + * @returns Proxy URL or null if Evomi not configured + */ +export function buildEvomiProxyUrl( + stateCode: string, + workerId: string, + city?: string +): { url: string; geo: string; source: 'api' } | null { + const config = getEvomiConfig(); + + if (!config.enabled) { + return null; + } + + const region = STATE_TO_REGION[stateCode.toUpperCase()]; + if (!region) { + console.warn(`[Evomi] Unknown state code: ${stateCode}`); + return null; + } + + // Generate session ID: workerId + region (sticky per worker per state) + const sessionId = `${workerId.replace(/[^a-zA-Z0-9]/g, '').slice(0, 6)}${region.slice(0, 4)}`; + + // Build geo target string + let geoParams = `_country-US_region-${region}`; + let geoDisplay = region; + + if (city) { + geoParams += `_city-${city}`; + geoDisplay = `${city}, ${region}`; + } + + // Build full proxy URL + // Format: http://user:pass_geo_session@host:port + const url = `http://${config.user}:${config.pass}${geoParams}_session-${sessionId}_lifetime-${config.sessionLifetimeMinutes}@${config.host}:${config.port}`; + + return { + url, + geo: geoDisplay, + source: 'api' as const, + }; +} + +/** + * Result from getProxyForTask + */ +export interface GeoProxyResult { + url: string; + ip: string | null; // Will be resolved after first request + geo: string; + source: 'api' | 'static'; +} + +/** + * Get a geo-targeted proxy for a specific task + * Prefers Evomi dynamic proxies, falls back to static table + * + * @param stateCode - Dispensary state code (e.g., 'AZ') + * @param workerId - Worker ID for session stickiness + * @param city - Optional city name + * @param rotator - CrawlRotator instance for fallback to static proxies + * @returns GeoProxyResult with URL and metadata + */ +export function getProxyForTask( + stateCode: string, + workerId: string, + city?: string, + rotator?: CrawlRotator +): GeoProxyResult | null { + // Try Evomi first + const evomiProxy = buildEvomiProxyUrl(stateCode, workerId, city); + if (evomiProxy) { + console.log(`[GeoProxy] Using Evomi for ${stateCode}: ${evomiProxy.geo}`); + return { + url: evomiProxy.url, + ip: null, // Will be resolved after connection + geo: evomiProxy.geo, + source: 'api', + }; + } + + // Fall back to static proxy table + if (rotator) { + const staticProxy = rotator.proxy.getCurrent(); + if (staticProxy) { + console.log(`[GeoProxy] Falling back to static proxy: ${staticProxy.host}`); + return { + url: rotator.proxy.getProxyUrl(staticProxy), + ip: staticProxy.host, + geo: staticProxy.state || staticProxy.country || 'unknown', + source: 'static', + }; + } + } + + console.warn(`[GeoProxy] No proxy available for ${stateCode}`); + return null; +} + // ============================================================ // SINGLETON INSTANCES // ============================================================ diff --git a/backend/src/services/task-scheduler.ts b/backend/src/services/task-scheduler.ts index d2103ff4..2c59e411 100644 --- a/backend/src/services/task-scheduler.ts +++ b/backend/src/services/task-scheduler.ts @@ -244,15 +244,22 @@ class TaskScheduler { * - Different crawl frequencies per state (e.g., AZ=4h, MI=6h) * - Better rate limit management (one state at a time) * - Easier debugging and monitoring per state + * + * Platform-aware: Each task gets platform from dispensary.menu_type + * (not from schedule.platform) so Jane/Dutchie stores route correctly. */ private async generateProductDiscoveryTasks(schedule: TaskSchedule): Promise { - let dispensaryIds: number[] = []; + interface DispensaryRow { + id: number; + menu_type: string | null; + } + let dispensaries: DispensaryRow[] = []; // Single-dispensary schedule (e.g., "Deeply Rooted Hourly") if (schedule.dispensary_id) { // Check if this specific store needs refresh const result = await pool.query(` - SELECT d.id + SELECT d.id, d.menu_type FROM dispensaries d WHERE d.id = $1 AND d.crawl_enabled = true @@ -266,9 +273,9 @@ class TaskScheduler { ) `, [schedule.dispensary_id]); - dispensaryIds = result.rows.map((r: { id: number }) => r.id); + dispensaries = result.rows; - if (dispensaryIds.length === 0) { + if (dispensaries.length === 0) { console.log(`[TaskScheduler] Store ${schedule.dispensary_id} has pending task or is disabled`); return 0; } @@ -277,9 +284,9 @@ class TaskScheduler { } // Per-state schedule (e.g., "AZ Product Refresh") else if (schedule.state_code) { - // Find stores in this state needing refresh + // Find stores in this state needing refresh (include menu_type for platform routing) const result = await pool.query(` - SELECT d.id + SELECT d.id, d.menu_type FROM dispensaries d JOIN states s ON d.state_id = s.id WHERE d.crawl_enabled = true @@ -300,14 +307,14 @@ class TaskScheduler { ORDER BY d.last_fetch_at NULLS FIRST, d.id `, [schedule.state_code, schedule.interval_hours]); - dispensaryIds = result.rows.map((r: { id: number }) => r.id); + dispensaries = result.rows; - if (dispensaryIds.length === 0) { + if (dispensaries.length === 0) { console.log(`[TaskScheduler] No stores in ${schedule.state_code} need refresh`); return 0; } - console.log(`[TaskScheduler] Creating ${dispensaryIds.length} product_discovery tasks for ${schedule.state_code}`); + console.log(`[TaskScheduler] Creating ${dispensaries.length} product_discovery tasks for ${schedule.state_code}`); } // No dispensary_id or state_code - invalid schedule else { @@ -315,21 +322,37 @@ class TaskScheduler { return 0; } - // Create product_discovery tasks with HTTP transport - // No stagger - worker controls pacing - const { created } = await taskService.createStaggeredTasks( - dispensaryIds, - 'product_discovery', - 0, // No stagger - worker controls pacing - schedule.platform || 'dutchie', - 'http', // Force HTTP transport - { - source: 'schedule', - source_schedule_id: schedule.id, + // Group dispensaries by platform (menu_type) + const byPlatform: Record = {}; + for (const d of dispensaries) { + const platform = d.menu_type || 'dutchie'; + if (!byPlatform[platform]) { + byPlatform[platform] = []; } - ); + byPlatform[platform].push(d.id); + } - return created; + // Create tasks per platform so each task routes to the correct handler + let totalCreated = 0; + for (const [platform, ids] of Object.entries(byPlatform)) { + if (ids.length > 0) { + console.log(`[TaskScheduler] Creating ${ids.length} product_discovery tasks for platform=${platform}`); + const { created } = await taskService.createStaggeredTasks( + ids, + 'product_discovery', + 0, // No stagger - worker controls pacing + platform, + 'http', // Force HTTP transport + { + source: 'schedule', + source_schedule_id: schedule.id, + } + ); + totalCreated += created; + } + } + + return totalCreated; } /** diff --git a/backend/src/tasks/handlers/payload-fetch-curl.ts b/backend/src/tasks/handlers/_deprecated-payload-fetch-curl.ts similarity index 96% rename from backend/src/tasks/handlers/payload-fetch-curl.ts rename to backend/src/tasks/handlers/_deprecated-payload-fetch-curl.ts index 5d803ffa..44f7e59c 100644 --- a/backend/src/tasks/handlers/payload-fetch-curl.ts +++ b/backend/src/tasks/handlers/_deprecated-payload-fetch-curl.ts @@ -1,4 +1,12 @@ /** + * @deprecated DO NOT USE - This file is deprecated. + * Use product-discovery-dutchie.ts instead (HTTP/Puppeteer transport). + * + * This was the old curl-based payload fetch handler. + * Kept for historical reference only. + * + * --- + * Original docs: * Payload Fetch Handler * * Per TASK_WORKFLOW_2024-12-10.md: Separates API fetch from data processing. diff --git a/backend/src/tasks/handlers/product-discovery-curl.ts b/backend/src/tasks/handlers/_deprecated-product-discovery-curl.ts similarity index 76% rename from backend/src/tasks/handlers/product-discovery-curl.ts rename to backend/src/tasks/handlers/_deprecated-product-discovery-curl.ts index 187e458d..0260ebb0 100644 --- a/backend/src/tasks/handlers/product-discovery-curl.ts +++ b/backend/src/tasks/handlers/_deprecated-product-discovery-curl.ts @@ -1,4 +1,12 @@ /** + * @deprecated DO NOT USE - This file is deprecated. + * Use product-discovery-dutchie.ts instead (HTTP/Puppeteer transport). + * + * This was the old curl-based product discovery handler. + * Kept for historical reference only. + * + * --- + * Original docs: * Product Discovery Handler * * Per TASK_WORKFLOW_2024-12-10.md: Initial product fetch for newly discovered stores. @@ -13,7 +21,7 @@ */ import { TaskContext, TaskResult } from '../task-worker'; -import { handlePayloadFetch } from './payload-fetch-curl'; +import { handlePayloadFetch } from './_deprecated-payload-fetch-curl'; export async function handleProductDiscovery(ctx: TaskContext): Promise { const { task } = ctx; diff --git a/backend/src/tasks/handlers/store-discovery.ts b/backend/src/tasks/handlers/_deprecated-store-discovery-curl.ts similarity index 92% rename from backend/src/tasks/handlers/store-discovery.ts rename to backend/src/tasks/handlers/_deprecated-store-discovery-curl.ts index 017888ad..42dfdea9 100644 --- a/backend/src/tasks/handlers/store-discovery.ts +++ b/backend/src/tasks/handlers/_deprecated-store-discovery-curl.ts @@ -1,4 +1,12 @@ /** + * @deprecated DO NOT USE - This file is deprecated. + * Use store-discovery-dutchie.ts instead (HTTP/Puppeteer transport). + * + * This was the old curl-based store discovery handler. + * Kept for historical reference only. + * + * --- + * Original docs: * Store Discovery Handler * * Per TASK_WORKFLOW_2024-12-10.md: Discovers new stores and returns their IDs for task chaining. diff --git a/backend/src/tasks/handlers/entry-point-discovery-jane.ts b/backend/src/tasks/handlers/entry-point-discovery-jane.ts new file mode 100644 index 00000000..adbc6b9c --- /dev/null +++ b/backend/src/tasks/handlers/entry-point-discovery-jane.ts @@ -0,0 +1,162 @@ +/** + * Jane Entry Point Discovery Handler + * + * Resolves a dispensary's menu_url to a Jane store ID (platform_dispensary_id). + * + * Flow: + * 1. Load dispensary with menu_url + * 2. Navigate to menu URL and capture store API call + * 3. Extract store ID from API response + * 4. Update dispensary.platform_dispensary_id + * 5. Queue product_discovery task + */ + +import { TaskContext, TaskResult } from '../task-worker'; +import { + startSession, + endSession, + setCrawlRotator, + resolveStoreFromUrl, +} from '../../platforms/jane'; +import { taskService } from '../task-service'; + +export async function handleEntryPointDiscoveryJane(ctx: TaskContext): Promise { + const { pool, task, crawlRotator } = ctx; + const dispensaryId = task.dispensary_id; + + if (!dispensaryId) { + return { + success: false, + error: 'Missing dispensary_id in task', + }; + } + + console.log(`[JaneEntryPoint] Starting for dispensary ${dispensaryId}`); + + try { + // Load dispensary + const dispResult = await pool.query( + `SELECT id, name, menu_url, platform_dispensary_id, menu_type + FROM dispensaries WHERE id = $1`, + [dispensaryId] + ); + + if (dispResult.rows.length === 0) { + return { + success: false, + error: `Dispensary ${dispensaryId} not found`, + }; + } + + const dispensary = dispResult.rows[0]; + + // Skip if already resolved + if (dispensary.platform_dispensary_id) { + console.log(`[JaneEntryPoint] Already resolved: ${dispensary.platform_dispensary_id}`); + return { + success: true, + alreadyResolved: true, + storeId: dispensary.platform_dispensary_id, + }; + } + + if (!dispensary.menu_url) { + return { + success: false, + error: `Dispensary ${dispensaryId} has no menu_url`, + }; + } + + console.log(`[JaneEntryPoint] Resolving: ${dispensary.menu_url}`); + + // Attach crawl rotator + if (crawlRotator) { + setCrawlRotator(crawlRotator); + } + + // Resolve store from URL + const result = await resolveStoreFromUrl(dispensary.menu_url); + + if (!result) { + // Update dispensary with resolution failure + await pool.query( + `UPDATE dispensaries + SET id_resolution_status = 'failed', + last_id_resolution_at = NOW(), + updated_at = NOW() + WHERE id = $1`, + [dispensaryId] + ); + + return { + success: false, + error: 'Could not resolve Jane store ID from URL', + }; + } + + const { storeId, store } = result; + console.log(`[JaneEntryPoint] Resolved store ID: ${storeId} (${store.name})`); + + // Update dispensary with resolved store ID and store data + await pool.query( + `UPDATE dispensaries + SET platform_dispensary_id = $1, + menu_type = 'jane', + id_resolution_status = 'resolved', + last_id_resolution_at = NOW(), + stage = 'promoted', + latitude = COALESCE(latitude, $2), + longitude = COALESCE(longitude, $3), + phone = COALESCE(phone, $4), + is_medical = $5, + is_recreational = $6, + updated_at = NOW() + WHERE id = $7`, + [ + storeId, + store.lat || null, + store.long || null, + store.phone || null, + store.medical, + store.recreational, + dispensaryId, + ] + ); + + // Queue product_discovery task + console.log(`[JaneEntryPoint] Queuing product_discovery for dispensary ${dispensaryId}`); + await taskService.createTask({ + role: 'product_discovery', + dispensary_id: dispensaryId, + platform: 'jane', + method: 'http', + priority: task.priority || 0, + }); + + return { + success: true, + storeId, + storeName: store.name, + productCount: store.product_count, + queuedProductDiscovery: true, + }; + } catch (error: unknown) { + const errorMessage = error instanceof Error ? error.message : 'Unknown error'; + console.error(`[JaneEntryPoint] Error:`, errorMessage); + + // Update dispensary with failure + await pool.query( + `UPDATE dispensaries + SET id_resolution_status = 'failed', + consecutive_failures = consecutive_failures + 1, + updated_at = NOW() + WHERE id = $1`, + [dispensaryId] + ).catch(() => {}); + + return { + success: false, + error: errorMessage, + }; + } +} diff --git a/backend/src/tasks/handlers/index.ts b/backend/src/tasks/handlers/index.ts index 7b8cf76c..24525c77 100644 --- a/backend/src/tasks/handlers/index.ts +++ b/backend/src/tasks/handlers/index.ts @@ -3,18 +3,27 @@ * * Exports all task handlers for the task worker. * - * Product Discovery: - * - handleProductDiscoveryCurl: curl/axios based (for curl transport) - * - handleProductDiscoveryHttp: Puppeteer browser-based (for http transport) + * Naming convention: {task}-{platform}.ts + * - product-discovery-dutchie.ts + * - product-discovery-jane.ts + * - store-discovery-dutchie.ts + * - store-discovery-jane.ts + * + * Deprecated handlers (curl transport) are in _deprecated-*.ts files. */ -export { handleProductDiscovery as handleProductDiscoveryCurl } from './product-discovery-curl'; -export { handleProductDiscoveryHttp } from './product-discovery-http'; -export { handlePayloadFetch as handlePayloadFetchCurl } from './payload-fetch-curl'; +// Shared handlers (platform-agnostic) export { handleProductRefresh } from './product-refresh'; -export { handleStoreDiscovery } from './store-discovery'; -export { handleStoreDiscoveryHttp } from './store-discovery-http'; export { handleStoreDiscoveryState } from './store-discovery-state'; export { handleEntryPointDiscovery } from './entry-point-discovery'; export { handleAnalyticsRefresh } from './analytics-refresh'; export { handleWhoami } from './whoami'; + +// Dutchie Platform Handlers +export { handleProductDiscoveryDutchie } from './product-discovery-dutchie'; +export { handleStoreDiscoveryDutchie } from './store-discovery-dutchie'; + +// Jane Platform Handlers +export { handleStoreDiscoveryJane } from './store-discovery-jane'; +export { handleEntryPointDiscoveryJane } from './entry-point-discovery-jane'; +export { handleProductDiscoveryJane } from './product-discovery-jane'; diff --git a/backend/src/tasks/handlers/product-discovery-http.ts b/backend/src/tasks/handlers/product-discovery-dutchie.ts similarity index 98% rename from backend/src/tasks/handlers/product-discovery-http.ts rename to backend/src/tasks/handlers/product-discovery-dutchie.ts index 23e188e1..23889069 100644 --- a/backend/src/tasks/handlers/product-discovery-http.ts +++ b/backend/src/tasks/handlers/product-discovery-dutchie.ts @@ -1,15 +1,17 @@ /** - * Product Discovery HTTP Handler (Browser-based) + * Product Discovery Handler - Dutchie Platform * * Uses Puppeteer + StealthPlugin to fetch products via browser context. * Based on test-intercept.js pattern from ORGANIC_SCRAPING_GUIDE.md. * + * Naming convention: {task}-{platform}.ts + * * This handler: * 1. Loads dispensary info * 2. Launches headless browser with proxy (if provided) * 3. Establishes session by visiting embedded menu * 4. Fetches ALL products via GraphQL from browser context - * 5. Saves raw payload to filesystem (gzipped) + * 5. Saves raw payload to filesystem (gzipped) at payloads/dutchie/... * 6. Records metadata in raw_crawl_payloads table * 7. Queues product_refresh task to process the payload * @@ -26,7 +28,7 @@ import { taskService } from '../task-service'; // GraphQL hash for FilteredProducts query - MUST match CLAUDE.md const FILTERED_PRODUCTS_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0'; -export async function handleProductDiscoveryHttp(ctx: TaskContext): Promise { +export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise { const { pool, task, crawlRotator, updateStep } = ctx; const dispensaryId = task.dispensary_id; diff --git a/backend/src/tasks/handlers/product-discovery-jane.ts b/backend/src/tasks/handlers/product-discovery-jane.ts new file mode 100644 index 00000000..ca1bec01 --- /dev/null +++ b/backend/src/tasks/handlers/product-discovery-jane.ts @@ -0,0 +1,168 @@ +/** + * Jane Product Discovery Handler + * + * Fetches all products from a Jane store via Puppeteer + network interception. + * + * Flow: + * 1. Load dispensary with platform_dispensary_id + * 2. Navigate to menu URL and capture Algolia product responses + * 3. Save raw payload to filesystem + * 4. Queue product_refresh task for normalization + */ + +import { TaskContext, TaskResult } from '../task-worker'; +import { + startSession, + endSession, + setCrawlRotator, + fetchProductsFromUrl, +} from '../../platforms/jane'; +import { saveRawPayload } from '../../utils/payload-storage'; +import { taskService } from '../task-service'; + +export async function handleProductDiscoveryJane(ctx: TaskContext): Promise { + const { pool, task, crawlRotator } = ctx; + const dispensaryId = task.dispensary_id; + + if (!dispensaryId) { + return { + success: false, + error: 'Missing dispensary_id in task', + }; + } + + console.log(`[JaneProductDiscovery] Starting for dispensary ${dispensaryId}`); + + try { + // Load dispensary + const dispResult = await pool.query( + `SELECT id, name, menu_url, platform_dispensary_id, menu_type + FROM dispensaries WHERE id = $1`, + [dispensaryId] + ); + + if (dispResult.rows.length === 0) { + return { + success: false, + error: `Dispensary ${dispensaryId} not found`, + }; + } + + const dispensary = dispResult.rows[0]; + + if (!dispensary.menu_url) { + return { + success: false, + error: `Dispensary ${dispensaryId} has no menu_url`, + }; + } + + console.log(`[JaneProductDiscovery] Fetching products from: ${dispensary.menu_url}`); + + // Attach crawl rotator + if (crawlRotator) { + setCrawlRotator(crawlRotator); + } + + // Fetch products + const result = await fetchProductsFromUrl(dispensary.menu_url); + + if (result.products.length === 0) { + console.warn(`[JaneProductDiscovery] No products captured for dispensary ${dispensaryId}`); + + // Update dispensary with failure + await pool.query( + `UPDATE dispensaries + SET consecutive_failures = consecutive_failures + 1, + updated_at = NOW() + WHERE id = $1`, + [dispensaryId] + ); + + return { + success: false, + error: 'No products captured from Jane menu page', + productCount: 0, + }; + } + + console.log(`[JaneProductDiscovery] Captured ${result.products.length} products`); + + // Build payload for storage + // Store the raw Algolia hits for the normalizer + const rawPayload = { + hits: result.products.map(p => p.raw), // Use raw product data + store: result.store?.raw || null, + capturedAt: new Date().toISOString(), + platform: 'jane', + dispensaryId, + storeId: dispensary.platform_dispensary_id, + }; + + // Save raw payload to filesystem (platform = 'jane') + const { id: payloadId, sizeBytes } = await saveRawPayload( + pool, + dispensaryId, + rawPayload, + null, // crawl_run_id + result.products.length, + 'jane' // platform + ); + + console.log(`[JaneProductDiscovery] Saved payload ${payloadId} (${Math.round(sizeBytes / 1024)}KB)`); + + // Update dispensary stage and timestamps + await pool.query( + `UPDATE dispensaries + SET stage = 'hydrating', + last_fetch_at = NOW(), + consecutive_successes = consecutive_successes + 1, + consecutive_failures = 0, + updated_at = NOW() + WHERE id = $1`, + [dispensaryId] + ); + + // Queue product_refresh task for normalization + console.log(`[JaneProductDiscovery] Queuing product_refresh for payload ${payloadId}`); + await taskService.createTask({ + role: 'product_refresh', + dispensary_id: dispensaryId, + platform: 'jane', + // method undefined = any worker can process (product_refresh is local) + priority: task.priority || 0, + payload: { payload_id: payloadId }, + }); + + return { + success: true, + productCount: result.products.length, + payloadId, + payloadSizeKB: Math.round(sizeBytes / 1024), + storeInfo: result.store ? { + id: result.store.id, + name: result.store.name, + productCount: result.store.product_count, + } : null, + queuedProductRefresh: true, + }; + } catch (error: unknown) { + const errorMessage = error instanceof Error ? error.message : 'Unknown error'; + console.error(`[JaneProductDiscovery] Error:`, errorMessage); + + // Update dispensary with failure + await pool.query( + `UPDATE dispensaries + SET consecutive_failures = consecutive_failures + 1, + stage = CASE WHEN consecutive_failures >= 2 THEN 'failing' ELSE stage END, + updated_at = NOW() + WHERE id = $1`, + [dispensaryId] + ).catch(() => {}); + + return { + success: false, + error: errorMessage, + }; + } +} diff --git a/backend/src/tasks/handlers/product-refresh.ts b/backend/src/tasks/handlers/product-refresh.ts index e3f05bfa..434f4420 100644 --- a/backend/src/tasks/handlers/product-refresh.ts +++ b/backend/src/tasks/handlers/product-refresh.ts @@ -8,19 +8,23 @@ * * Flow: * 1. Load payload from filesystem (by payload_id or latest for dispensary) - * 2. Normalize data via DutchieNormalizer - * 3. Upsert to store_products and store_product_snapshots - * 4. Track missing products (increment consecutive_misses, mark OOS at 3) - * 5. Download new product images + * 2. Select normalizer based on dispensary.menu_type (dutchie or jane) + * 3. Normalize data via platform-specific normalizer + * 4. Upsert to store_products and store_product_snapshots + * 5. Track missing products (increment consecutive_misses, mark OOS at 3) + * 6. Download new product images * * Benefits of separation: * - Retry-friendly: If this fails, re-run without re-crawling * - Replay-able: Run against any historical payload * - Faster: Local file read vs network call + * - Platform-agnostic: Same handler works for Dutchie and Jane */ import { TaskContext, TaskResult } from '../task-worker'; import { DutchieNormalizer } from '../../hydration/normalizers/dutchie'; +import { JaneNormalizer } from '../../hydration/normalizers/jane'; +import { BaseNormalizer } from '../../hydration/normalizers/base'; import { upsertStoreProducts, createStoreProductSnapshots, @@ -29,7 +33,19 @@ import { import { loadRawPayloadById, getLatestPayload } from '../../utils/payload-storage'; import { taskService } from '../task-service'; -const normalizer = new DutchieNormalizer(); +// Platform-aware normalizer registry +const NORMALIZERS: Record = { + dutchie: new DutchieNormalizer(), + jane: new JaneNormalizer(), +}; + +/** + * Get normalizer for a platform, defaults to Dutchie + */ +function getNormalizer(platform: string | null): BaseNormalizer { + const key = platform || 'dutchie'; + return NORMALIZERS[key] || NORMALIZERS.dutchie; +} export async function handleProductRefresh(ctx: TaskContext): Promise { const { pool, task, updateStep } = ctx; @@ -90,7 +106,9 @@ export async function handleProductRefresh(ctx: TaskContext): Promise 0) { @@ -179,10 +206,11 @@ export async function handleProductRefresh(ctx: TaskContext): Promise { +export async function handleStoreDiscoveryDutchie(ctx: TaskContext): Promise { const { pool, task, crawlRotator, updateStep } = ctx; const platform = task.platform || 'dutchie'; diff --git a/backend/src/tasks/handlers/store-discovery-jane.ts b/backend/src/tasks/handlers/store-discovery-jane.ts new file mode 100644 index 00000000..afc16782 --- /dev/null +++ b/backend/src/tasks/handlers/store-discovery-jane.ts @@ -0,0 +1,166 @@ +/** + * Jane Store Discovery Handler + * + * Discovers iHeartJane stores by state and inserts them into the dispensaries table. + * + * Flow: + * 1. Navigate to Jane's store directory + * 2. Capture store data from API responses + * 3. Insert new stores into dispensaries table with menu_type='jane' + * 4. Return newStoreIds[] for chaining to product_discovery + */ + +import { Pool } from 'pg'; +import { TaskContext, TaskResult } from '../task-worker'; +import { + startSession, + endSession, + setCrawlRotator, + discoverStoresByState, + DiscoveredStore, +} from '../../platforms/jane'; + +export async function handleStoreDiscoveryJane(ctx: TaskContext): Promise { + const { pool, task, crawlRotator } = ctx; + + // Get state from task payload + const stateCode = task.payload?.state as string; + if (!stateCode) { + return { + success: false, + error: 'Missing state in task payload', + newStoreIds: [], + }; + } + + console.log(`[JaneStoreDiscovery] Starting discovery for state: ${stateCode}`); + + try { + // Attach crawl rotator if available + if (crawlRotator) { + setCrawlRotator(crawlRotator); + } + + // Discover stores + const stores = await discoverStoresByState(stateCode); + + if (stores.length === 0) { + console.log(`[JaneStoreDiscovery] No stores found in ${stateCode}`); + return { + success: true, + storesDiscovered: 0, + storesInserted: 0, + newStoreIds: [], + message: `No Jane stores found in ${stateCode}`, + }; + } + + console.log(`[JaneStoreDiscovery] Found ${stores.length} stores in ${stateCode}`); + + // Insert stores into dispensaries table + const { inserted, newIds } = await insertJaneStores(pool, stores, stateCode); + + console.log(`[JaneStoreDiscovery] Inserted ${inserted} new stores, ${newIds.length} IDs for chaining`); + + return { + success: true, + storesDiscovered: stores.length, + storesInserted: inserted, + newStoreIds: newIds, + state: stateCode, + }; + } catch (error: unknown) { + const errorMessage = error instanceof Error ? error.message : 'Unknown error'; + console.error(`[JaneStoreDiscovery] Error:`, errorMessage); + return { + success: false, + error: errorMessage, + newStoreIds: [], + }; + } +} + +/** + * Insert discovered Jane stores into dispensaries table + */ +async function insertJaneStores( + pool: Pool, + stores: DiscoveredStore[], + stateCode: string +): Promise<{ inserted: number; newIds: number[] }> { + const newIds: number[] = []; + let inserted = 0; + + for (const store of stores) { + try { + // Build menu URL from store data + const menuUrl = `https://www.iheartjane.com/stores/${store.storeId}/${store.urlSlug || 'menu'}`; + + // Upsert into dispensaries + // Use platform_dispensary_id as conflict key to avoid duplicates + const result = await pool.query( + `INSERT INTO dispensaries ( + name, + address, + city, + state, + zip, + latitude, + longitude, + menu_url, + menu_type, + platform_dispensary_id, + is_medical, + is_recreational, + stage, + created_at, + updated_at + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, NOW(), NOW()) + ON CONFLICT (menu_type, platform_dispensary_id) + WHERE menu_type = 'jane' AND platform_dispensary_id IS NOT NULL + DO UPDATE SET + name = EXCLUDED.name, + address = EXCLUDED.address, + city = EXCLUDED.city, + latitude = EXCLUDED.latitude, + longitude = EXCLUDED.longitude, + menu_url = EXCLUDED.menu_url, + is_medical = EXCLUDED.is_medical, + is_recreational = EXCLUDED.is_recreational, + updated_at = NOW() + RETURNING id, (xmax = 0) AS is_new`, + [ + store.name, + store.address, + store.city, + stateCode, + store.zip, + store.lat, + store.long, + menuUrl, + 'jane', + store.storeId, + store.medical, + store.recreational, + 'discovered', + ] + ); + + if (result.rows.length > 0) { + const { id, is_new } = result.rows[0]; + if (is_new) { + newIds.push(id); + inserted++; + console.log(`[JaneStoreDiscovery] Inserted: ${store.name} (ID: ${id}, Jane ID: ${store.storeId})`); + } else { + console.log(`[JaneStoreDiscovery] Updated: ${store.name} (ID: ${id})`); + } + } + } catch (error: any) { + // Log but continue with other stores + console.error(`[JaneStoreDiscovery] Error inserting ${store.name}:`, error.message); + } + } + + return { inserted, newIds }; +} diff --git a/backend/src/tasks/index.ts b/backend/src/tasks/index.ts index 713aa6db..a81334ca 100644 --- a/backend/src/tasks/index.ts +++ b/backend/src/tasks/index.ts @@ -17,10 +17,15 @@ export { export { TaskWorker, TaskContext, TaskResult } from './task-worker'; export { - handleProductDiscoveryCurl, - handleProductDiscoveryHttp, + // Shared handlers handleProductRefresh, - handleStoreDiscovery, handleEntryPointDiscovery, handleAnalyticsRefresh, + // Dutchie handlers + handleProductDiscoveryDutchie, + handleStoreDiscoveryDutchie, + // Jane handlers + handleProductDiscoveryJane, + handleStoreDiscoveryJane, + handleEntryPointDiscoveryJane, } from './handlers'; diff --git a/backend/src/tasks/task-worker.ts b/backend/src/tasks/task-worker.ts index 5df072d7..8011f5e7 100644 --- a/backend/src/tasks/task-worker.ts +++ b/backend/src/tasks/task-worker.ts @@ -68,19 +68,22 @@ import { runCurlPreflight, CurlPreflightResult } from '../services/curl-prefligh import { runPuppeteerPreflightWithRetry, PuppeteerPreflightResult } from '../services/puppeteer-preflight'; // Task handlers by role -// Per TASK_WORKFLOW_2024-12-10.md: payload_fetch and product_refresh are now separate -// Dual-transport: curl vs http (browser-based) handlers -import { handlePayloadFetch } from './handlers/payload-fetch-curl'; +// Platform-based handlers: {task}-{platform}.ts convention import { handleProductRefresh } from './handlers/product-refresh'; -import { handleProductDiscovery } from './handlers/product-discovery-curl'; -import { handleProductDiscoveryHttp } from './handlers/product-discovery-http'; -import { handleStoreDiscovery } from './handlers/store-discovery'; -import { handleStoreDiscoveryHttp } from './handlers/store-discovery-http'; import { handleStoreDiscoveryState } from './handlers/store-discovery-state'; import { handleEntryPointDiscovery } from './handlers/entry-point-discovery'; import { handleAnalyticsRefresh } from './handlers/analytics-refresh'; import { handleWhoami } from './handlers/whoami'; +// Dutchie Platform Handlers +import { handleProductDiscoveryDutchie } from './handlers/product-discovery-dutchie'; +import { handleStoreDiscoveryDutchie } from './handlers/store-discovery-dutchie'; + +// Jane Platform Handlers +import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane'; +import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane'; +import { handleProductDiscoveryJane } from './handlers/product-discovery-jane'; + const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000'); const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000'); const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010'; @@ -152,48 +155,66 @@ export interface TaskResult { type TaskHandler = (ctx: TaskContext) => Promise; // Per TASK_WORKFLOW_2024-12-10.md: Handler registry -// payload_fetch: Fetches from Dutchie API, saves to disk -// product_refresh: Reads local payload, normalizes, upserts to DB -// product_discovery: Main handler for product crawling (has curl and http variants) -const TASK_HANDLERS: Record = { - payload_fetch: handlePayloadFetch, // API fetch -> disk (curl) - product_refresh: handleProductRefresh, // disk -> DB - product_discovery: handleProductDiscovery, // Default: curl (see getHandlerForTask for http override) - store_discovery: handleStoreDiscovery, - store_discovery_state: handleStoreDiscoveryState, // Per-state parallelized discovery +// Platform-agnostic handlers (shared across Dutchie and Jane) +// product_refresh: Reads local payload, uses platform-aware normalizer, upserts to DB +const SHARED_HANDLERS: Partial> = { + product_refresh: handleProductRefresh, + store_discovery_state: handleStoreDiscoveryState, entry_point_discovery: handleEntryPointDiscovery, analytics_refresh: handleAnalyticsRefresh, - whoami: handleWhoami, // Tests proxy + anti-detect + whoami: handleWhoami, }; /** - * Get the appropriate handler for a task, considering both role and method. + * Get the appropriate handler for a task based on platform. * - * Dual-transport handlers: - * - product_discovery: curl (axios) or http (Puppeteer) - * - store_discovery: curl (axios) or http (Puppeteer) + * Naming convention: {task}-{platform}.ts + * - product-discovery-dutchie.ts + * - product-discovery-jane.ts + * - store-discovery-dutchie.ts + * - store-discovery-jane.ts * - * Default method is 'http' since all GraphQL queries should use browser transport - * for better TLS fingerprinting and session-based proxy compatibility. + * All handlers use HTTP/Puppeteer transport (curl transport is deprecated). */ function getHandlerForTask(task: WorkerTask): TaskHandler | undefined { const role = task.role as TaskRole; - const method = task.method || 'http'; // Default to HTTP for all GraphQL tasks + const platform = task.platform || 'dutchie'; - // product_discovery: dual-transport support - if (role === 'product_discovery' && method === 'http') { - console.log(`[TaskWorker] Using HTTP handler for product_discovery (method=${method})`); - return handleProductDiscoveryHttp; + // ========================================================================== + // JANE PLATFORM ROUTING + // ========================================================================== + if (platform === 'jane') { + if (role === 'store_discovery' || role === 'store_discovery_state') { + console.log(`[TaskWorker] Using Jane handler for store_discovery`); + return handleStoreDiscoveryJane; + } + if (role === 'entry_point_discovery') { + console.log(`[TaskWorker] Using Jane handler for entry_point_discovery`); + return handleEntryPointDiscoveryJane; + } + if (role === 'product_discovery') { + console.log(`[TaskWorker] Using Jane handler for product_discovery`); + return handleProductDiscoveryJane; + } } - // store_discovery: dual-transport support - if (role === 'store_discovery' && method === 'http') { - console.log(`[TaskWorker] Using HTTP handler for store_discovery (method=${method})`); - return handleStoreDiscoveryHttp; + // ========================================================================== + // DUTCHIE PLATFORM ROUTING (default) + // ========================================================================== + if (role === 'product_discovery') { + console.log(`[TaskWorker] Using Dutchie handler for product_discovery`); + return handleProductDiscoveryDutchie; } - // Default: use the static handler registry (curl-based) - return TASK_HANDLERS[role]; + if (role === 'store_discovery') { + console.log(`[TaskWorker] Using Dutchie handler for store_discovery`); + return handleStoreDiscoveryDutchie; + } + + // ========================================================================== + // SHARED HANDLERS (platform-agnostic) + // ========================================================================== + return SHARED_HANDLERS[role]; } /** diff --git a/backend/src/utils/payload-storage.ts b/backend/src/utils/payload-storage.ts index 9a481c91..c4445ed2 100644 --- a/backend/src/utils/payload-storage.ts +++ b/backend/src/utils/payload-storage.ts @@ -1,17 +1,23 @@ /** * Payload Storage Utility * - * Per TASK_WORKFLOW_2024-12-10.md: Store raw GraphQL payloads for historical analysis. + * Per TASK_WORKFLOW_2024-12-10.md: Store raw API payloads for historical analysis. * * Design Pattern: Metadata/Payload Separation * - Metadata in PostgreSQL (raw_crawl_payloads table): Small, indexed, queryable * - Payload stored in MinIO/S3 (or local filesystem as fallback): Gzipped JSON * * Storage structure (MinIO): - * cannaiq/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz + * cannaiq/payloads/{platform}/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz * * Storage structure (Local fallback): - * ./storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz + * ./storage/payloads/{platform}/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz + * + * Platform values: 'dutchie', 'jane' + * + * Examples: + * payloads/dutchie/2024/12/13/store_456_1734105600000.json.gz + * payloads/jane/2024/12/13/store_2788_1734105600000.json.gz * * Benefits: * - Compare any two crawls to see what changed @@ -20,6 +26,7 @@ * - DB stays small, backups stay fast * - ~90% compression (1.5MB -> 150KB per crawl) * - Shared storage accessible by all worker pods (MinIO) + * - Platform separation for different retention/management policies */ import * as fs from 'fs'; @@ -98,23 +105,25 @@ export interface LoadPayloadResult { /** * Generate storage path/key for a payload * - * MinIO format: payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz - * Local format: ./storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz + * MinIO format: payloads/{platform}/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz + * Local format: ./storage/payloads/{platform}/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz + * + * Platform defaults to 'dutchie' for backward compatibility */ -function generateStoragePath(dispensaryId: number, timestamp: Date): string { +function generateStoragePath(dispensaryId: number, timestamp: Date, platform: string = 'dutchie'): string { const year = timestamp.getFullYear(); const month = String(timestamp.getMonth() + 1).padStart(2, '0'); const day = String(timestamp.getDate()).padStart(2, '0'); const ts = timestamp.getTime(); - const relativePath = `payloads/${year}/${month}/${day}/store_${dispensaryId}_${ts}.json.gz`; + const relativePath = `payloads/${platform}/${year}/${month}/${day}/store_${dispensaryId}_${ts}.json.gz`; if (useMinIO) { // MinIO uses forward slashes, no leading slash return relativePath; } else { // Local filesystem uses OS-specific path - return path.join(PAYLOAD_BASE_PATH, String(year), month, day, `store_${dispensaryId}_${ts}.json.gz`); + return path.join(PAYLOAD_BASE_PATH, platform, String(year), month, day, `store_${dispensaryId}_${ts}.json.gz`); } } @@ -138,9 +147,10 @@ function calculateChecksum(data: Buffer): string { * * @param pool - Database connection pool * @param dispensaryId - ID of the dispensary - * @param payload - Raw JSON payload from GraphQL + * @param payload - Raw JSON payload from GraphQL/API * @param crawlRunId - Optional crawl_run ID for linking * @param productCount - Number of products in payload + * @param platform - Platform identifier ('dutchie' | 'jane'), defaults to 'dutchie' * @returns SavePayloadResult with file info and DB record ID */ export async function saveRawPayload( @@ -148,10 +158,11 @@ export async function saveRawPayload( dispensaryId: number, payload: any, crawlRunId: number | null = null, - productCount: number = 0 + productCount: number = 0, + platform: string = 'dutchie' ): Promise { const timestamp = new Date(); - const storagePath = generateStoragePath(dispensaryId, timestamp); + const storagePath = generateStoragePath(dispensaryId, timestamp, platform); // Serialize and compress const jsonStr = JSON.stringify(payload); diff --git a/cannaiq/src/App.tsx b/cannaiq/src/App.tsx index b7b1c426..4c9de26c 100755 --- a/cannaiq/src/App.tsx +++ b/cannaiq/src/App.tsx @@ -47,6 +47,7 @@ import CrossStateCompare from './pages/CrossStateCompare'; import StateDetail from './pages/StateDetail'; import { Discovery } from './pages/Discovery'; import { WorkersDashboard } from './pages/WorkersDashboard'; +import { ProxyManagement } from './pages/ProxyManagement'; import TasksDashboard from './pages/TasksDashboard'; import { ScraperOverviewDashboard } from './pages/ScraperOverviewDashboard'; import { SeoOrchestrator } from './pages/admin/seo/SeoOrchestrator'; @@ -124,6 +125,8 @@ export default function App() { } /> {/* Workers Dashboard */} } /> + {/* Proxy Management */} + } /> {/* Task Queue Dashboard */} } /> {/* Scraper Overview Dashboard (new primary) */} diff --git a/cannaiq/src/pages/ProxyManagement.tsx b/cannaiq/src/pages/ProxyManagement.tsx new file mode 100644 index 00000000..afb14614 --- /dev/null +++ b/cannaiq/src/pages/ProxyManagement.tsx @@ -0,0 +1,436 @@ +import { useState, useEffect, useCallback } from 'react'; +import { Layout } from '../components/Layout'; +import { api } from '../lib/api'; +import { + Globe, + RefreshCw, + Server, + MapPin, + Trash2, + Plus, + Shield, + ShieldCheck, + ShieldX, + Wifi, + WifiOff, + Clock, + Database, + Cloud, + Activity, +} from 'lucide-react'; + +interface Proxy { + id: number; + host: string; + port: number; + protocol: string; + username: string; + city: string | null; + state: string | null; + country: string | null; + country_code: string | null; + active: boolean; + failure_count: number; + consecutive_403_count: number; + response_time_ms: number | null; + last_tested_at: string | null; +} + +interface ProxySession { + worker_id: string; + proxy_ip: string; + proxy_geo: string; + state: string; + started_at: string; + expires_at: string; + tasks_completed: number; +} + +interface EvomiSettings { + enabled: boolean; + api_key_set: boolean; + available_regions: string[]; + available_cities: string[]; +} + +interface TaskWithProxy { + id: number; + worker_id: string; + role: string; + status: string; + proxy_ip: string | null; + proxy_geo: string | null; + proxy_source: 'api' | 'static' | null; + dispensary_name: string; + dispensary_city: string; + dispensary_state: string; + created_at: string; +} + +export function ProxyManagement() { + const [proxies, setProxies] = useState([]); + const [sessions, setSessions] = useState([]); + const [tasks, setTasks] = useState([]); + const [evomiSettings, setEvomiSettings] = useState(null); + const [loading, setLoading] = useState(true); + const [activeTab, setActiveTab] = useState<'sessions' | 'tasks' | 'static'>('sessions'); + const [proxySource, setProxySource] = useState<'api' | 'static'>('api'); + + const fetchData = useCallback(async () => { + try { + setLoading(true); + + // Fetch all proxy-related data + const [proxiesRes, sessionsRes, tasksRes, settingsRes] = await Promise.all([ + api.get('/api/admin/proxies'), + api.get('/api/admin/proxies/sessions'), + api.get('/api/admin/proxies/tasks?limit=100'), + api.get('/api/admin/proxies/evomi-settings'), + ]); + + setProxies(proxiesRes.data.proxies || []); + setSessions(sessionsRes.data.sessions || []); + setTasks(tasksRes.data.tasks || []); + setEvomiSettings(settingsRes.data || null); + setProxySource(settingsRes.data?.enabled ? 'api' : 'static'); + } catch (error) { + console.error('Error fetching proxy data:', error); + } finally { + setLoading(false); + } + }, []); + + useEffect(() => { + fetchData(); + const interval = setInterval(fetchData, 15000); // Refresh every 15s + return () => clearInterval(interval); + }, [fetchData]); + + const getSourceBadge = (source: 'api' | 'static' | null) => { + if (source === 'api') { + return ( + + + Evomi API + + ); + } else if (source === 'static') { + return ( + + + Static + + ); + } + return ( + + - + + ); + }; + + return ( + +
+ {/* Header */} +
+
+ +
+

Proxy Management

+

Geo-targeted proxy sessions and usage tracking

+
+
+
+ {/* Source indicator */} +
+ Source: + {proxySource === 'api' ? ( + + + Evomi API + + ) : ( + + + Static Table + + )} +
+ +
+
+ + {/* Evomi API Status Card */} + {evomiSettings && ( +
+
+
+
+ {evomiSettings.enabled ? ( + + ) : ( + + )} +
+
+

Evomi Residential Proxies

+

+ {evomiSettings.enabled + ? `Dynamic geo-targeting enabled - ${evomiSettings.available_regions.length} regions available` + : 'API not configured - using static proxy table'} +

+
+
+
+
+ {evomiSettings.available_regions.length} US States +
+
+ {evomiSettings.available_cities.length} Cities +
+
+
+
+ )} + + {/* Tabs */} +
+ + + +
+ + {/* Active Sessions Tab */} + {activeTab === 'sessions' && ( +
+ + + + + + + + + + + + + + {sessions.length === 0 ? ( + + + + ) : ( + sessions.map((session, idx) => ( + + + + + + + + + + )) + )} + +
WorkerProxy IPGeo TargetStateStartedExpiresTasks
+ No active proxy sessions +
{session.worker_id}{session.proxy_ip}{session.proxy_geo} + + + {session.state} + + + {new Date(session.started_at).toLocaleTimeString()} + + {new Date(session.expires_at).toLocaleTimeString()} + {session.tasks_completed}
+
+ )} + + {/* Task History Tab */} + {activeTab === 'tasks' && ( +
+ + + + + + + + + + + + + + + + {tasks.length === 0 ? ( + + + + ) : ( + tasks.map((task) => ( + + + + + + + + + + + + )) + )} + +
IDWorkerProxy IPGeoSourceTaskDispensaryLocationStatus
+ No task history with proxy data +
{task.id} + {task.worker_id ? task.worker_id.substring(0, 20) : '-'} + + {task.proxy_ip || '-'} + + {task.proxy_geo || '-'} + {getSourceBadge(task.proxy_source)}{task.role} + {task.dispensary_name} + + {task.dispensary_city}, {task.dispensary_state} + + + {task.status} + +
+
+ )} + + {/* Static Proxies Tab */} + {activeTab === 'static' && ( +
+
+

+ Static proxy list for fallback when Evomi API is unavailable +

+ +
+ + + + + + + + + + + + + + {proxies.length === 0 ? ( + + + + ) : ( + proxies.map((proxy) => ( + + + + + + + + + + )) + )} + +
Host:PortLocationStatusResponse TimeFailures403sActions
+ No static proxies configured. Using Evomi API for geo-targeted proxies. +
+ {proxy.host}:{proxy.port} + + {proxy.city && proxy.state + ? `${proxy.city}, ${proxy.state}` + : proxy.country || '-'} + + {proxy.active ? ( + + + Active + + ) : ( + + + Inactive + + )} + + {proxy.response_time_ms ? `${proxy.response_time_ms}ms` : '-'} + {proxy.failure_count}{proxy.consecutive_403_count} + +
+
+ )} +
+
+ ); +} diff --git a/cannaiq/src/pages/WorkersDashboard.tsx b/cannaiq/src/pages/WorkersDashboard.tsx index 33fc3465..d674485c 100644 --- a/cannaiq/src/pages/WorkersDashboard.tsx +++ b/cannaiq/src/pages/WorkersDashboard.tsx @@ -350,33 +350,23 @@ function PreflightSummary({ worker }: { worker: Worker }) { } if (httpError) tooltipLines.push(`Error: ${httpError}`); - // Qualification styling - GOLD for qualified workers + // Qualification styling - compact with icon badge and geo if (isQualified) { return (
- {/* Qualified badge - GOLD */} -
- - QUALIFIED + {/* Qualified icon + IP on same line */} +
+
+ +
+ {httpIp && ( + {httpIp} + )}
- {/* IP address */} - {httpIp && ( -
- - {httpIp} -
- )} - {/* Fingerprint summary */} - {fingerprint?.browser && ( -
- - {fingerprint.browser} -
- )} - {/* Antidetect status */} -
- - Antidetect OK + {/* Antidetect status with response time */} +
+ Antidetect + OK {httpMs && ({httpMs}ms)}