feat: Platform isolation, Evomi geo-targeting, proxy management

Platform isolation:
- Rename handlers to {task}-{platform}.ts convention
- Deprecate -curl variants (now _deprecated-*)
- Platform-based routing in task-worker.ts
- Add Jane platform handlers and client

Evomi geo-targeting:
- Add dynamic proxy URL builder with state/city targeting
- Session stickiness per worker per state (30 min)
- Fallback to static proxy table when API unavailable
- Add proxy tracking columns to worker_tasks

Proxy management:
- New /proxies admin page for visibility
- Track proxy_ip, proxy_geo, proxy_source per task
- Show active sessions and task history

Validation filtering:
- Filter by validated stores (platform_dispensary_id + menu_url)
- Mark incomplete stores as deprecated
- Update all dashboard/stats queries

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-13 15:16:48 -07:00
parent 59e0e45f8f
commit c215d11a84
31 changed files with 2698 additions and 129 deletions

View File

@@ -0,0 +1,23 @@
-- Migration: 107_proxy_tracking.sql
-- Description: Add proxy tracking columns to worker_tasks for geo-targeting visibility
-- Created: 2025-12-13
-- Add proxy tracking columns to worker_tasks
ALTER TABLE worker_tasks
ADD COLUMN IF NOT EXISTS proxy_ip VARCHAR(45);
ALTER TABLE worker_tasks
ADD COLUMN IF NOT EXISTS proxy_geo VARCHAR(100);
ALTER TABLE worker_tasks
ADD COLUMN IF NOT EXISTS proxy_source VARCHAR(10);
-- Comments
COMMENT ON COLUMN worker_tasks.proxy_ip IS 'IP address of proxy used for this task';
COMMENT ON COLUMN worker_tasks.proxy_geo IS 'Geo target used (e.g., "arizona", "phoenix, arizona")';
COMMENT ON COLUMN worker_tasks.proxy_source IS 'Source of proxy: "api" (Evomi dynamic) or "static" (fallback table)';
-- Index for proxy analysis
CREATE INDEX IF NOT EXISTS idx_worker_tasks_proxy_ip
ON worker_tasks(proxy_ip)
WHERE proxy_ip IS NOT NULL;

View File

@@ -388,6 +388,9 @@ async function promoteLocation(
country = EXCLUDED.country,
status = EXCLUDED.status,
dutchie_discovery_id = EXCLUDED.dutchie_discovery_id,
dutchie_verified = TRUE,
dutchie_verified_at = COALESCE(dispensaries.dutchie_verified_at, CURRENT_TIMESTAMP),
crawl_enabled = COALESCE(dispensaries.crawl_enabled, TRUE),
last_modified_at = EXCLUDED.last_modified_at,
last_modified_by_task = EXCLUDED.last_modified_by_task,
last_modified_task_id = EXCLUDED.last_modified_task_id,

View File

@@ -0,0 +1,250 @@
/**
* iHeartJane Platform Normalizer
*
* Normalizes raw Jane/Algolia product responses to canonical format.
*
* Jane uses Algolia for product search. Key differences from Dutchie:
* - Product ID is numeric (not MongoDB ObjectId)
* - Prices are per-weight (price_gram, price_eighth_ounce, etc.)
* - Category = strain type (hybrid, indica, sativa)
* - Kind = product type (vape, flower, edible, etc.)
*/
import { BaseNormalizer } from './base';
import {
NormalizedProduct,
NormalizedPricing,
NormalizedAvailability,
NormalizedBrand,
NormalizedCategory,
} from '../types';
export class JaneNormalizer extends BaseNormalizer {
readonly platform = 'jane';
readonly supportedVersions = [1];
// ============================================================
// EXTRACTION
// ============================================================
extractProducts(rawJson: any): any[] {
// Algolia response format: { hits: [...] }
if (rawJson?.hits && Array.isArray(rawJson.hits)) {
return rawJson.hits;
}
// Direct array of products
if (Array.isArray(rawJson)) {
return rawJson;
}
// Products array wrapper
if (rawJson?.products && Array.isArray(rawJson.products)) {
return rawJson.products;
}
// Try data.hits (nested response)
if (rawJson?.data?.hits && Array.isArray(rawJson.data.hits)) {
return rawJson.data.hits;
}
console.warn('[JaneNormalizer] Could not extract products from payload');
return [];
}
validatePayload(rawJson: any): { valid: boolean; errors: string[] } {
const errors: string[] = [];
if (!rawJson) {
errors.push('Payload is null or undefined');
return { valid: false, errors };
}
const products = this.extractProducts(rawJson);
if (products.length === 0) {
errors.push('No products found in payload');
}
// Check for Algolia errors
if (rawJson?.message) {
errors.push(`Algolia error: ${rawJson.message}`);
}
return { valid: errors.length === 0, errors };
}
// ============================================================
// NORMALIZATION
// ============================================================
protected normalizeProduct(rawProduct: any, dispensaryId: number): NormalizedProduct | null {
const externalId = rawProduct.product_id || rawProduct.objectID;
if (!externalId) {
console.warn('[JaneNormalizer] Product missing ID, skipping');
return null;
}
const name = rawProduct.name;
if (!name) {
console.warn(`[JaneNormalizer] Product ${externalId} missing name, skipping`);
return null;
}
return {
externalProductId: String(externalId),
dispensaryId,
platform: 'jane',
platformDispensaryId: '', // Will be set by handler
// Core fields
name,
brandName: rawProduct.brand || null,
brandId: rawProduct.product_brand_id ? String(rawProduct.product_brand_id) : null,
category: rawProduct.kind || null, // Jane's "kind" = product type (vape, flower, etc.)
subcategory: rawProduct.kind_subtype || rawProduct.root_subtype || null,
type: rawProduct.kind || null,
strainType: rawProduct.category || null, // Jane's "category" = strain (hybrid, indica, sativa)
// Potency
thcPercent: rawProduct.percent_thc ?? null,
cbdPercent: rawProduct.percent_cbd ?? null,
thcContent: rawProduct.percent_thc ?? null,
cbdContent: rawProduct.percent_cbd ?? null,
// Status - Jane products in search are always active
status: 'Active',
isActive: true,
medicalOnly: rawProduct.store_types?.includes('medical') && !rawProduct.store_types?.includes('recreational'),
recOnly: rawProduct.store_types?.includes('recreational') && !rawProduct.store_types?.includes('medical'),
// Images
primaryImageUrl: rawProduct.image_urls?.[0] || null,
images: (rawProduct.image_urls || []).map((url: string, i: number) => ({
url,
position: i,
})),
// Raw reference
rawProduct,
};
}
protected normalizePricing(rawProduct: any): NormalizedPricing | null {
const externalId = rawProduct.product_id || rawProduct.objectID;
if (!externalId) return null;
// Jane has multiple price fields by weight
const prices: number[] = [];
const specialPrices: number[] = [];
// Collect all regular prices
if (rawProduct.price_gram) prices.push(rawProduct.price_gram);
if (rawProduct.price_each) prices.push(rawProduct.price_each);
if (rawProduct.price_half_gram) prices.push(rawProduct.price_half_gram);
if (rawProduct.price_eighth_ounce) prices.push(rawProduct.price_eighth_ounce);
if (rawProduct.price_quarter_ounce) prices.push(rawProduct.price_quarter_ounce);
if (rawProduct.price_half_ounce) prices.push(rawProduct.price_half_ounce);
if (rawProduct.price_ounce) prices.push(rawProduct.price_ounce);
if (rawProduct.price_two_gram) prices.push(rawProduct.price_two_gram);
// Collect special/discounted prices
if (rawProduct.special_price_gram) specialPrices.push(rawProduct.special_price_gram);
if (rawProduct.special_price_each) specialPrices.push(rawProduct.special_price_each);
if (rawProduct.discounted_price_gram) specialPrices.push(rawProduct.discounted_price_gram);
if (rawProduct.discounted_price_each) specialPrices.push(rawProduct.discounted_price_each);
// Also check bucket_price and sort_price
if (rawProduct.bucket_price && !prices.includes(rawProduct.bucket_price)) {
prices.push(rawProduct.bucket_price);
}
// Determine if on special
const isOnSpecial = specialPrices.length > 0 || rawProduct.has_brand_discount === true;
// Calculate discount percent
let discountPercent: number | null = null;
if (isOnSpecial && prices.length > 0 && specialPrices.length > 0) {
const regularMin = Math.min(...prices);
const specialMin = Math.min(...specialPrices);
if (regularMin > 0 && specialMin < regularMin) {
discountPercent = Math.round(((regularMin - specialMin) / regularMin) * 100);
}
}
// Get special name from brand_special_prices
let specialName: string | null = null;
if (rawProduct.brand_special_prices) {
const firstSpecial = Object.values(rawProduct.brand_special_prices)[0] as any;
if (firstSpecial?.title) {
specialName = firstSpecial.title;
}
}
return {
externalProductId: String(externalId),
// Use minimum price for display
priceRec: this.toCents(this.getMin(prices)),
priceRecMin: this.toCents(this.getMin(prices)),
priceRecMax: this.toCents(this.getMax(prices)),
priceRecSpecial: this.toCents(this.getMin(specialPrices)),
// Jane doesn't distinguish med pricing in Algolia response
priceMed: null,
priceMedMin: null,
priceMedMax: null,
priceMedSpecial: null,
isOnSpecial,
specialName,
discountPercent,
};
}
protected normalizeAvailability(rawProduct: any): NormalizedAvailability | null {
const externalId = rawProduct.product_id || rawProduct.objectID;
if (!externalId) return null;
// Jane products in Algolia are in-stock (OOS products aren't returned)
const availableForPickup = rawProduct.available_for_pickup ?? true;
const availableForDelivery = rawProduct.available_for_delivery ?? false;
const inStock = availableForPickup || availableForDelivery;
// Jane doesn't expose quantity in Algolia
const quantity = rawProduct.max_cart_quantity || null;
return {
externalProductId: String(externalId),
inStock,
stockStatus: inStock ? 'in_stock' : 'out_of_stock',
quantity,
quantityAvailable: quantity,
isBelowThreshold: false, // Jane doesn't expose this
optionsBelowThreshold: false,
};
}
protected extractBrand(rawProduct: any): NormalizedBrand | null {
const brandName = rawProduct.brand;
if (!brandName) return null;
return {
externalBrandId: rawProduct.product_brand_id ? String(rawProduct.product_brand_id) : null,
name: brandName,
slug: this.slugify(brandName),
logoUrl: rawProduct.brand_logo_url || null,
};
}
protected extractCategory(rawProduct: any): NormalizedCategory | null {
// Use "kind" as the primary category (vape, flower, edible, etc.)
const categoryName = rawProduct.kind;
if (!categoryName) return null;
return {
name: categoryName,
slug: this.slugify(categoryName),
parentCategory: null,
};
}
}

View File

@@ -90,6 +90,7 @@ import publicApiRoutes from './routes/public-api';
import usersRoutes from './routes/users';
import staleProcessesRoutes from './routes/stale-processes';
import orchestratorAdminRoutes from './routes/orchestrator-admin';
import proxyAdminRoutes from './routes/proxy-admin';
import adminDebugRoutes from './routes/admin-debug';
import intelligenceRoutes from './routes/intelligence';
import marketsRoutes from './routes/markets';
@@ -172,6 +173,10 @@ app.use('/api/stale-processes', staleProcessesRoutes);
// Admin routes - orchestrator actions
app.use('/api/admin/orchestrator', orchestratorAdminRoutes);
// Admin routes - proxy management (geo-targeting, sessions, fallback)
app.use('/api/admin/proxies', proxyAdminRoutes);
console.log('[ProxyAdmin] Routes registered at /api/admin/proxies');
// Admin routes - debug endpoints (snapshot inspection)
app.use('/api/admin/debug', adminDebugRoutes);
console.log('[AdminDebug] Routes registered at /api/admin/debug');

View File

@@ -0,0 +1,545 @@
/**
* ============================================================
* iHEARTJANE PLATFORM CLIENT
* ============================================================
*
* Jane uses Cloudflare protection, so all requests must go through
* Puppeteer with stealth plugin. This client manages browser sessions
* and network interception to capture API responses.
*
* API Endpoints:
* - Store: GET https://api.iheartjane.com/v1/stores/{store_id}
* - Products: POST https://search.iheartjane.com/1/indexes/menu-products-production/query (Algolia)
*
* Store ID Format: Numeric (e.g., "2788")
*
* ============================================================
*/
import puppeteer, { Browser, Page } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import puppeteerExtra from 'puppeteer-extra';
import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
// Register stealth plugin
puppeteerExtra.use(StealthPlugin());
// ============================================================
// TYPES
// ============================================================
export interface JaneStoreData {
id: number;
name: string;
address: string;
city: string;
state: string;
zip: string;
lat: number;
long: number;
phone: string;
medical: boolean;
recreational: boolean;
product_count: number;
rating: number;
reviews_count: number;
working_hours: Array<{ day: string; period: { from: string; to: string } }>;
url_slug: string;
photo?: string;
// Full raw data preserved
raw: any;
}
export interface JaneProductHit {
product_id: number;
name: string;
brand: string;
kind: string;
category: string;
percent_thc: number | null;
percent_cbd: number | null;
price_gram: number | null;
price_each: number | null;
price_eighth_ounce: number | null;
price_quarter_ounce: number | null;
price_half_ounce: number | null;
price_ounce: number | null;
image_urls: string[];
aggregate_rating: number | null;
review_count: number | null;
available_for_pickup: boolean;
available_for_delivery: boolean;
// Full raw data preserved
raw: any;
}
export interface JaneSession {
sessionId: string;
browser: Browser;
page: Page;
fingerprint: BrowserFingerprint;
proxyUrl: string | null;
startedAt: Date;
storeId?: string;
}
export interface CapturedResponse {
type: 'store' | 'product' | 'algolia' | 'api';
url: string;
data: any;
}
// ============================================================
// CONFIGURATION
// ============================================================
export const JANE_CONFIG = {
storeApiBase: 'https://api.iheartjane.com/v1',
algoliaEndpoint: 'https://search.iheartjane.com/1/indexes/menu-products-production/query',
timeout: 60000,
maxRetries: 3,
navigationTimeout: 60000,
productFetchDelay: 1000,
};
// ============================================================
// SESSION MANAGEMENT
// ============================================================
let currentSession: JaneSession | null = null;
let crawlRotator: CrawlRotator | null = null;
/**
* Set CrawlRotator for proxy/fingerprint management
*/
export function setCrawlRotator(rotator: CrawlRotator | null): void {
crawlRotator = rotator;
if (rotator) {
console.log('[Jane Client] CrawlRotator attached');
}
}
/**
* Get attached CrawlRotator
*/
export function getCrawlRotator(): CrawlRotator | null {
return crawlRotator;
}
/**
* Start a new Jane browser session
* Uses Puppeteer + Stealth plugin to bypass Cloudflare
*/
export async function startSession(storeId?: string): Promise<JaneSession> {
if (currentSession) {
console.log('[Jane Client] Closing existing session before starting new one');
await endSession();
}
// Get fingerprint from rotator or use defaults
let fingerprint: BrowserFingerprint;
let proxyUrl: string | null = null;
if (crawlRotator) {
fingerprint = crawlRotator.userAgent.getCurrent();
const proxy = crawlRotator.proxy.getCurrent();
if (proxy) {
proxyUrl = crawlRotator.proxy.getProxyUrl(proxy);
}
} else {
// Default fingerprint for local testing
fingerprint = {
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
browserName: 'Chrome',
deviceCategory: 'desktop',
platform: 'Windows',
screenWidth: 1920,
screenHeight: 1080,
viewportWidth: 1920,
viewportHeight: 1080,
acceptLanguage: 'en-US,en;q=0.9',
secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
secChUaPlatform: '"Windows"',
secChUaMobile: '?0',
httpFingerprint: {
browserType: 'Chrome' as const,
headers: {},
headerOrder: [],
curlImpersonateBinary: 'curl_chrome131',
hasDNT: false,
},
};
}
// Build browser args
const browserArgs = [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
];
if (proxyUrl) {
// Extract host:port from proxy URL for puppeteer
const proxyMatch = proxyUrl.match(/:\/\/([^@]+@)?([^/]+)/);
if (proxyMatch) {
browserArgs.push(`--proxy-server=${proxyMatch[2]}`);
}
}
console.log('[Jane Client] Launching browser...');
const browser = await puppeteerExtra.launch({
headless: true,
args: browserArgs,
});
const page = await browser.newPage();
// Set viewport
await page.setViewport({
width: fingerprint.viewportWidth || 1920,
height: fingerprint.viewportHeight || 1080,
});
// Set user agent
await page.setUserAgent(fingerprint.userAgent);
// Handle proxy authentication if needed
if (proxyUrl) {
const authMatch = proxyUrl.match(/:\/\/([^:]+):([^@]+)@/);
if (authMatch) {
await page.authenticate({
username: authMatch[1],
password: authMatch[2],
});
}
}
const sessionId = `jane_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
currentSession = {
sessionId,
browser,
page,
fingerprint,
proxyUrl,
startedAt: new Date(),
storeId,
};
console.log(`[Jane Client] Started session ${sessionId}`);
console.log(`[Jane Client] Browser: ${fingerprint.browserName} (${fingerprint.deviceCategory})`);
if (proxyUrl) {
console.log(`[Jane Client] Proxy: ${proxyUrl.replace(/:[^:@]+@/, ':***@')}`);
}
return currentSession;
}
/**
* End the current browser session
*/
export async function endSession(): Promise<void> {
if (currentSession) {
const duration = Math.round((Date.now() - currentSession.startedAt.getTime()) / 1000);
console.log(`[Jane Client] Ending session ${currentSession.sessionId} (${duration}s)`);
try {
await currentSession.browser.close();
} catch (e) {
console.warn('[Jane Client] Error closing browser:', e);
}
currentSession = null;
}
}
/**
* Get current active session
*/
export function getCurrentSession(): JaneSession | null {
return currentSession;
}
// ============================================================
// NETWORK INTERCEPTION
// ============================================================
/**
* Navigate to a Jane menu page and capture API responses
* Returns captured store and product data from network interception
*/
export async function navigateAndCapture(
menuUrl: string,
options: { waitForProducts?: boolean; captureStore?: boolean } = {}
): Promise<{ store?: JaneStoreData; products: JaneProductHit[]; responses: CapturedResponse[] }> {
const { waitForProducts = true, captureStore = true } = options;
if (!currentSession) {
throw new Error('[Jane Client] No active session - call startSession() first');
}
const { page } = currentSession;
const capturedResponses: CapturedResponse[] = [];
let storeData: JaneStoreData | undefined;
const products: JaneProductHit[] = [];
// Enable request interception
await page.setRequestInterception(true);
// Block heavy resources to speed up page load
page.on('request', (req) => {
const type = req.resourceType();
if (['image', 'font', 'media', 'stylesheet'].includes(type)) {
req.abort();
} else {
req.continue();
}
});
// Capture API responses
page.on('response', async (response) => {
const url = response.url();
const contentType = response.headers()['content-type'] || '';
// Only process JSON responses from Jane domains
if ((url.includes('iheartjane.com') || url.includes('algolia')) && contentType.includes('json')) {
try {
const json = await response.json();
// Categorize response
if (url.includes('/stores/')) {
capturedResponses.push({ type: 'store', url, data: json });
if (captureStore && json.store) {
storeData = parseStoreData(json.store);
console.log(`[Jane Client] Captured store: ${storeData.name} (ID: ${storeData.id})`);
}
} else if (url.includes('algolia') || url.includes('search.iheartjane')) {
capturedResponses.push({ type: 'algolia', url, data: json });
if (json.hits && Array.isArray(json.hits)) {
const newProducts = json.hits.map(parseProductHit);
products.push(...newProducts);
console.log(`[Jane Client] Captured ${json.hits.length} products from Algolia`);
}
} else {
capturedResponses.push({ type: 'api', url, data: json });
}
} catch {
// Not valid JSON, skip
}
}
});
console.log(`[Jane Client] Navigating to ${menuUrl}`);
try {
await page.goto(menuUrl, {
waitUntil: 'networkidle2',
timeout: JANE_CONFIG.navigationTimeout,
});
// Wait for products to load
if (waitForProducts) {
console.log('[Jane Client] Waiting for product data...');
await sleep(3000);
// Try to wait for product selector
try {
await page.waitForSelector('[data-testid="product-card"], .product-card, [class*="ProductCard"]', {
timeout: 10000,
});
} catch {
console.log('[Jane Client] No product cards found via selector (may have loaded via API)');
}
}
} catch (error: any) {
console.error(`[Jane Client] Navigation error: ${error.message}`);
throw error;
}
// Record success if we got data
if (crawlRotator && (storeData || products.length > 0)) {
await crawlRotator.recordSuccess();
}
return { store: storeData, products, responses: capturedResponses };
}
/**
* Fetch store info directly via browser context
* Uses page.evaluate to make fetch request from browser (bypasses Cloudflare)
*/
export async function fetchStoreInfo(storeId: string | number): Promise<JaneStoreData | null> {
if (!currentSession) {
throw new Error('[Jane Client] No active session - call startSession() first');
}
const { page } = currentSession;
const url = `${JANE_CONFIG.storeApiBase}/stores/${storeId}`;
console.log(`[Jane Client] Fetching store info: ${url}`);
try {
const result = await page.evaluate(async (apiUrl: string) => {
try {
const resp = await fetch(apiUrl);
if (resp.ok) {
return await resp.json();
}
return { error: resp.status };
} catch (e: any) {
return { error: e.message };
}
}, url);
if (result.error) {
console.error(`[Jane Client] Store fetch failed: ${result.error}`);
return null;
}
if (result.store) {
const storeData = parseStoreData(result.store);
console.log(`[Jane Client] Got store: ${storeData.name}`);
return storeData;
}
return null;
} catch (error: any) {
console.error(`[Jane Client] Error fetching store: ${error.message}`);
return null;
}
}
/**
* Extract store ID from a Jane menu URL
* Jane URLs don't always contain the store ID directly - we may need to intercept it
*/
export async function extractStoreIdFromUrl(menuUrl: string): Promise<string | null> {
if (!currentSession) {
throw new Error('[Jane Client] No active session - call startSession() first');
}
// Try to extract from URL pattern first (if embedded)
const urlMatch = menuUrl.match(/store[s]?[\/=](\d+)/i);
if (urlMatch) {
return urlMatch[1];
}
// Otherwise, navigate and intercept the store API call
console.log('[Jane Client] Store ID not in URL, intercepting from navigation...');
const { page } = currentSession;
let capturedStoreId: string | null = null;
// Listen for store API calls
const storeIdPromise = new Promise<string | null>((resolve) => {
const timeout = setTimeout(() => resolve(null), 30000);
page.on('response', async (response) => {
const url = response.url();
if (url.includes('/v1/stores/') && !capturedStoreId) {
const match = url.match(/\/stores\/(\d+)/);
if (match) {
capturedStoreId = match[1];
clearTimeout(timeout);
resolve(capturedStoreId);
}
}
});
});
// Enable interception and navigate
await page.setRequestInterception(true);
page.on('request', (req) => {
const type = req.resourceType();
if (['image', 'font', 'media'].includes(type)) {
req.abort();
} else {
req.continue();
}
});
await page.goto(menuUrl, {
waitUntil: 'domcontentloaded',
timeout: JANE_CONFIG.navigationTimeout,
});
// Wait for store ID to be captured
const storeId = await storeIdPromise;
if (storeId) {
console.log(`[Jane Client] Extracted store ID: ${storeId}`);
} else {
console.warn('[Jane Client] Could not extract store ID from URL');
}
return storeId;
}
// ============================================================
// DATA PARSING
// ============================================================
/**
* Parse raw store data into normalized structure
*/
function parseStoreData(raw: any): JaneStoreData {
return {
id: raw.id,
name: raw.name,
address: raw.address || '',
city: raw.city || '',
state: raw.state || '',
zip: raw.zip || '',
lat: raw.lat || 0,
long: raw.long || 0,
phone: raw.phone || '',
medical: raw.medical || false,
recreational: raw.recreational || false,
product_count: raw.product_count || 0,
rating: raw.rating || 0,
reviews_count: raw.reviews_count || 0,
working_hours: raw.working_hours || [],
url_slug: raw.url_slug || '',
photo: raw.photo,
raw,
};
}
/**
* Parse raw Algolia product hit into normalized structure
*/
function parseProductHit(hit: any): JaneProductHit {
return {
product_id: hit.product_id,
name: hit.name,
brand: hit.brand,
kind: hit.kind,
category: hit.category,
percent_thc: hit.percent_thc ?? null,
percent_cbd: hit.percent_cbd ?? null,
price_gram: hit.price_gram ?? null,
price_each: hit.price_each ?? null,
price_eighth_ounce: hit.price_eighth_ounce ?? null,
price_quarter_ounce: hit.price_quarter_ounce ?? null,
price_half_ounce: hit.price_half_ounce ?? null,
price_ounce: hit.price_ounce ?? null,
image_urls: hit.image_urls || [],
aggregate_rating: hit.aggregate_rating ?? null,
review_count: hit.review_count ?? null,
available_for_pickup: hit.available_for_pickup ?? false,
available_for_delivery: hit.available_for_delivery ?? false,
raw: hit,
};
}
// ============================================================
// UTILITY
// ============================================================
function sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}

View File

@@ -0,0 +1,48 @@
/**
* iHeartJane Platform Module
*
* Single export point for all Jane communication.
* All Jane workers MUST import from this module.
*/
export {
// Session Management
startSession,
endSession,
getCurrentSession,
// Proxy/Rotation
setCrawlRotator,
getCrawlRotator,
// Core Operations
navigateAndCapture,
fetchStoreInfo,
extractStoreIdFromUrl,
// Configuration
JANE_CONFIG,
// Types
type JaneSession,
type JaneStoreData,
type JaneProductHit,
type CapturedResponse,
} from './client';
// High-level Query Functions
export {
resolveStoreFromUrl,
getStoreById,
fetchProductsFromUrl,
fetchProductsByStoreId,
discoverStoresByState,
// Types
type ResolveStoreResult,
type FetchProductsResult,
type DiscoveredStore,
} from './queries';
// Re-export CrawlRotator types from canonical location
export type { CrawlRotator, Proxy, ProxyStats } from '../../services/crawl-rotator';

View File

@@ -0,0 +1,292 @@
/**
* iHeartJane High-Level Query Functions
*
* Wraps the low-level client methods with business logic
* for common operations like store lookup and product fetching.
*/
import {
startSession,
endSession,
navigateAndCapture,
fetchStoreInfo,
extractStoreIdFromUrl,
JaneStoreData,
JaneProductHit,
CapturedResponse,
} from './client';
// ============================================================
// STORE OPERATIONS
// ============================================================
export interface ResolveStoreResult {
storeId: string;
store: JaneStoreData;
}
/**
* Resolve a menu URL to a Jane store ID and fetch store details
*
* @param menuUrl - The dispensary's menu URL (e.g., https://theflowershopusa.com/mesa/menu/)
* @returns Store ID and store data, or null if not found
*/
export async function resolveStoreFromUrl(menuUrl: string): Promise<ResolveStoreResult | null> {
try {
await startSession();
// Navigate and capture store data
const { store, responses } = await navigateAndCapture(menuUrl, {
waitForProducts: false,
captureStore: true,
});
if (store) {
return {
storeId: String(store.id),
store,
};
}
// If store wasn't captured from navigation, try to extract ID and fetch directly
const storeId = await extractStoreIdFromUrl(menuUrl);
if (storeId) {
const storeData = await fetchStoreInfo(storeId);
if (storeData) {
return {
storeId,
store: storeData,
};
}
}
// Check if we captured any store response
const storeResponse = responses.find((r) => r.type === 'store');
if (storeResponse?.data?.store) {
const storeData = storeResponse.data.store;
return {
storeId: String(storeData.id),
store: storeData,
};
}
return null;
} finally {
await endSession();
}
}
/**
* Get store information by store ID
*
* @param storeId - Jane store ID (numeric string)
* @returns Store data or null if not found
*/
export async function getStoreById(storeId: string | number): Promise<JaneStoreData | null> {
try {
await startSession(String(storeId));
// Need to navigate to a Jane page first to establish browser context
// Use the Jane stores page as a base
const { page } = (await import('./client')).getCurrentSession()!;
await page.goto('https://www.iheartjane.com/stores', {
waitUntil: 'domcontentloaded',
timeout: 30000,
});
// Now fetch store info
return await fetchStoreInfo(storeId);
} finally {
await endSession();
}
}
// ============================================================
// PRODUCT OPERATIONS
// ============================================================
export interface FetchProductsResult {
store?: JaneStoreData;
products: JaneProductHit[];
totalCaptured: number;
responses: CapturedResponse[];
}
/**
* Fetch all products from a Jane menu page
*
* @param menuUrl - The dispensary's menu URL
* @returns Products and store data captured from the page
*/
export async function fetchProductsFromUrl(menuUrl: string): Promise<FetchProductsResult> {
try {
await startSession();
const { store, products, responses } = await navigateAndCapture(menuUrl, {
waitForProducts: true,
captureStore: true,
});
return {
store,
products,
totalCaptured: products.length,
responses,
};
} finally {
await endSession();
}
}
/**
* Fetch products for a known store ID
* Constructs a Jane menu URL and fetches products
*
* @param storeId - Jane store ID
* @param urlSlug - Optional URL slug for the store
* @returns Products and store data
*/
export async function fetchProductsByStoreId(
storeId: string | number,
urlSlug?: string
): Promise<FetchProductsResult> {
// Construct a Jane embedded menu URL
// Jane stores can be accessed via: https://www.iheartjane.com/stores/{store_id}/{url_slug}
const slug = urlSlug || 'menu';
const menuUrl = `https://www.iheartjane.com/stores/${storeId}/${slug}`;
return fetchProductsFromUrl(menuUrl);
}
// ============================================================
// DISCOVERY OPERATIONS
// ============================================================
export interface DiscoveredStore {
storeId: string;
name: string;
address: string;
city: string;
state: string;
zip: string;
lat: number;
long: number;
medical: boolean;
recreational: boolean;
productCount: number;
urlSlug: string;
}
/**
* Discover Jane stores in a state
* Navigates to Jane's store locator and extracts store data
*
* @param stateCode - Two-letter state code (e.g., 'AZ')
* @returns Array of discovered stores
*/
export async function discoverStoresByState(stateCode: string): Promise<DiscoveredStore[]> {
const stores: DiscoveredStore[] = [];
try {
await startSession();
const { page } = (await import('./client')).getCurrentSession()!;
// Jane has a store directory at /stores
// Try state-specific URL first
const storeListUrl = `https://www.iheartjane.com/stores?state=${stateCode}`;
console.log(`[Jane Queries] Discovering stores in ${stateCode}: ${storeListUrl}`);
await page.setRequestInterception(true);
// Capture store list responses
const storeResponses: any[] = [];
page.on('request', (req) => {
const type = req.resourceType();
if (['image', 'font', 'media', 'stylesheet'].includes(type)) {
req.abort();
} else {
req.continue();
}
});
page.on('response', async (response) => {
const url = response.url();
const contentType = response.headers()['content-type'] || '';
if (url.includes('iheartjane.com') && contentType.includes('json')) {
try {
const json = await response.json();
if (json.stores && Array.isArray(json.stores)) {
storeResponses.push(...json.stores);
console.log(`[Jane Queries] Captured ${json.stores.length} stores from API`);
}
} catch {
// Not valid JSON
}
}
});
await page.goto(storeListUrl, {
waitUntil: 'networkidle2',
timeout: 60000,
});
// Wait for stores to load
await new Promise((r) => setTimeout(r, 3000));
// Parse captured stores
for (const store of storeResponses) {
// Filter by state
if (store.state?.toLowerCase() === stateCode.toLowerCase() ||
store.state?.toLowerCase() === getStateName(stateCode).toLowerCase()) {
stores.push({
storeId: String(store.id),
name: store.name || '',
address: store.address || '',
city: store.city || '',
state: store.state || stateCode,
zip: store.zip || '',
lat: store.lat || 0,
long: store.long || 0,
medical: store.medical || false,
recreational: store.recreational || false,
productCount: store.product_count || 0,
urlSlug: store.url_slug || '',
});
}
}
console.log(`[Jane Queries] Found ${stores.length} stores in ${stateCode}`);
return stores;
} finally {
await endSession();
}
}
// ============================================================
// UTILITY
// ============================================================
/**
* Convert state code to full state name
*/
function getStateName(code: string): string {
const states: Record<string, string> = {
AL: 'Alabama', AK: 'Alaska', AZ: 'Arizona', AR: 'Arkansas', CA: 'California',
CO: 'Colorado', CT: 'Connecticut', DE: 'Delaware', FL: 'Florida', GA: 'Georgia',
HI: 'Hawaii', ID: 'Idaho', IL: 'Illinois', IN: 'Indiana', IA: 'Iowa',
KS: 'Kansas', KY: 'Kentucky', LA: 'Louisiana', ME: 'Maine', MD: 'Maryland',
MA: 'Massachusetts', MI: 'Michigan', MN: 'Minnesota', MS: 'Mississippi', MO: 'Missouri',
MT: 'Montana', NE: 'Nebraska', NV: 'Nevada', NH: 'New Hampshire', NJ: 'New Jersey',
NM: 'New Mexico', NY: 'New York', NC: 'North Carolina', ND: 'North Dakota', OH: 'Ohio',
OK: 'Oklahoma', OR: 'Oregon', PA: 'Pennsylvania', RI: 'Rhode Island', SC: 'South Carolina',
SD: 'South Dakota', TN: 'Tennessee', TX: 'Texas', UT: 'Utah', VT: 'Vermont',
VA: 'Virginia', WA: 'Washington', WV: 'West Virginia', WI: 'Wisconsin', WY: 'Wyoming',
DC: 'District of Columbia',
};
return states[code.toUpperCase()] || code;
}

View File

@@ -130,7 +130,7 @@ router.get('/national/summary', async (req, res) => {
return res.json(cached);
}
// Single optimized query for all state metrics
// Single optimized query for all state metrics (only validated stores)
const { rows: stateMetrics } = await pool.query(`
SELECT
d.state,
@@ -146,6 +146,9 @@ router.get('/national/summary', async (req, res) => {
LEFT JOIN store_products sp ON d.id = sp.dispensary_id
LEFT JOIN states s ON d.state = s.code
WHERE d.state IS NOT NULL
AND d.platform_dispensary_id IS NOT NULL
AND d.menu_url IS NOT NULL
AND (d.stage IS NULL OR d.stage != 'deprecated')
GROUP BY d.state, s.name
ORDER BY store_count DESC
`);

View File

@@ -53,6 +53,7 @@ router.get('/', async (req, res) => {
last_crawl_at,
crawl_enabled,
dutchie_verified,
stage,
created_at,
updated_at
FROM dispensaries
@@ -79,15 +80,17 @@ router.get('/', async (req, res) => {
params.push(state);
}
// Filter by crawl_enabled - defaults to showing only enabled
// Filter by crawl_enabled
if (crawl_enabled === 'false' || crawl_enabled === '0') {
// Explicitly show disabled only
conditions.push(`(crawl_enabled = false OR crawl_enabled IS NULL)`);
} else if (crawl_enabled === 'all') {
// Show all (no filter)
// Show all including deprecated (no validation filter)
} else {
// Default: show only enabled
conditions.push(`crawl_enabled = true`);
// Default: show only validated stores (has location data, not deprecated)
conditions.push(`platform_dispensary_id IS NOT NULL`);
conditions.push(`menu_url IS NOT NULL`);
conditions.push(`(stage IS NULL OR stage != 'deprecated')`);
}
// Filter by dutchie_verified if provided
@@ -140,12 +143,15 @@ router.get('/', async (req, res) => {
}
});
// Get menu type stats
// Get menu type stats (only validated stores)
router.get('/stats/menu-types', async (req, res) => {
try {
const result = await pool.query(`
SELECT menu_type, COUNT(*) as count
FROM dispensaries
WHERE platform_dispensary_id IS NOT NULL
AND menu_url IS NOT NULL
AND (stage IS NULL OR stage != 'deprecated')
GROUP BY menu_type
ORDER BY count DESC
`);
@@ -163,6 +169,8 @@ router.get('/stats/crawl-status', async (req, res) => {
let query = `
SELECT
COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL AND menu_url IS NOT NULL AND (stage IS NULL OR stage != 'deprecated')) as validated_count,
COUNT(*) FILTER (WHERE stage = 'deprecated') as deprecated_count,
COUNT(*) FILTER (WHERE crawl_enabled = true) as enabled_count,
COUNT(*) FILTER (WHERE crawl_enabled = false OR crawl_enabled IS NULL) as disabled_count,
COUNT(*) FILTER (WHERE dutchie_verified = true) as verified_count,

View File

@@ -26,10 +26,10 @@ router.get('/dashboard', async (req: Request, res: Response) => {
return res.json(dashboardCache.data);
}
// Single optimized query for all counts
// Single optimized query for all counts (only validated stores)
const { rows } = await pool.query(`
SELECT
(SELECT COUNT(*) FROM dispensaries) as dispensary_count,
(SELECT COUNT(*) FROM dispensaries WHERE platform_dispensary_id IS NOT NULL AND menu_url IS NOT NULL AND (stage IS NULL OR stage != 'deprecated')) as dispensary_count,
(SELECT n_live_tup FROM pg_stat_user_tables WHERE relname = 'store_products') as product_count,
(SELECT COUNT(*) FROM (SELECT DISTINCT brand_name_raw FROM store_products WHERE brand_name_raw IS NOT NULL LIMIT 10000) b) as brand_count,
(SELECT COUNT(*) FROM (SELECT DISTINCT category_raw FROM store_products WHERE category_raw IS NOT NULL LIMIT 1000) c) as category_count,

View File

@@ -78,14 +78,17 @@ router.get('/metrics', async (_req: Request, res: Response) => {
/**
* GET /api/admin/orchestrator/states
* Returns array of states with at least one crawl-enabled dispensary
* Returns array of states with at least one validated dispensary
*/
router.get('/states', async (_req: Request, res: Response) => {
try {
const { rows } = await pool.query(`
SELECT DISTINCT state, COUNT(*) as store_count
FROM dispensaries
WHERE state IS NOT NULL AND crawl_enabled = true
WHERE state IS NOT NULL
AND platform_dispensary_id IS NOT NULL
AND menu_url IS NOT NULL
AND (stage IS NULL OR stage != 'deprecated')
GROUP BY state
ORDER BY state
`);

View File

@@ -982,19 +982,24 @@ router.get('/stats', async (_req: Request, res: Response) => {
GROUP BY stage
`);
// Dispensaries by stage
// Dispensaries by stage (only validated stores)
const { rows: dispensaryStats } = await pool.query(`
SELECT stage, COUNT(*) as count
FROM dispensaries
WHERE crawl_enabled = true
WHERE platform_dispensary_id IS NOT NULL
AND menu_url IS NOT NULL
AND (stage IS NULL OR stage != 'deprecated')
GROUP BY stage
`);
// By state for dispensaries
// By state for dispensaries (only validated stores)
const { rows: byState } = await pool.query(`
SELECT state, stage, COUNT(*) as count
FROM dispensaries
WHERE crawl_enabled = true AND state IS NOT NULL
WHERE state IS NOT NULL
AND platform_dispensary_id IS NOT NULL
AND menu_url IS NOT NULL
AND (stage IS NULL OR stage != 'deprecated')
GROUP BY state, stage
ORDER BY state, stage
`);

View File

@@ -0,0 +1,173 @@
import { Router, Request, Response } from 'express';
import { pool } from '../db/pool';
import { authMiddleware } from '../auth/middleware';
const router = Router();
router.use(authMiddleware);
/**
* GET /api/admin/proxies
* Get all static proxies (fallback when API unavailable)
*/
router.get('/', async (_req: Request, res: Response) => {
try {
const { rows } = await pool.query(`
SELECT
id, host, port, protocol, username,
city, state, country, country_code,
active, failure_count, consecutive_403_count,
response_time_ms, last_tested_at
FROM proxies
ORDER BY active DESC, failure_count ASC
`);
res.json({ proxies: rows });
} catch (error: any) {
console.error('[ProxyAdmin] Error fetching proxies:', error.message);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/admin/proxies/sessions
* Get active proxy sessions (geo-locked workers)
*/
router.get('/sessions', async (_req: Request, res: Response) => {
try {
// For now, return worker preflight data as "sessions"
// TODO: Implement proper session tracking table
const { rows } = await pool.query(`
SELECT
worker_id,
http_ip as proxy_ip,
'US' as proxy_geo,
'AZ' as state,
preflight_http_at as started_at,
preflight_http_at + interval '30 minutes' as expires_at,
tasks_completed as tasks_completed
FROM worker_registry
WHERE status = 'active'
AND http_ip IS NOT NULL
ORDER BY preflight_http_at DESC
`);
res.json({ sessions: rows });
} catch (error: any) {
console.error('[ProxyAdmin] Error fetching sessions:', error.message);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/admin/proxies/tasks
* Get recent tasks with proxy usage info
*/
router.get('/tasks', async (req: Request, res: Response) => {
try {
const limit = parseInt(req.query.limit as string) || 100;
const { rows } = await pool.query(`
SELECT
t.id,
t.worker_id,
t.role,
t.status,
t.proxy_ip,
t.proxy_geo,
t.proxy_source,
d.name as dispensary_name,
d.city as dispensary_city,
d.state as dispensary_state,
t.created_at
FROM worker_tasks t
LEFT JOIN dispensaries d ON t.dispensary_id = d.id
ORDER BY t.id DESC
LIMIT $1
`, [limit]);
res.json({ tasks: rows });
} catch (error: any) {
console.error('[ProxyAdmin] Error fetching tasks:', error.message);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/admin/proxies/evomi-settings
* Get Evomi API configuration status
*/
router.get('/evomi-settings', async (_req: Request, res: Response) => {
try {
const apiKeySet = !!process.env.EVOMI_API_KEY;
// US state codes for region mapping
const usStates = [
'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
];
// Major US cities we know are available
const majorCities = [
'phoenix', 'tucson', 'los.angeles', 'san.francisco', 'denver',
'chicago', 'new.york', 'miami', 'seattle', 'portland',
'las.vegas', 'boston', 'detroit', 'atlanta', 'dallas'
];
res.json({
enabled: apiKeySet,
api_key_set: apiKeySet,
available_regions: usStates,
available_cities: majorCities,
});
} catch (error: any) {
console.error('[ProxyAdmin] Error fetching Evomi settings:', error.message);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/admin/proxies
* Add a static proxy (fallback)
*/
router.post('/', async (req: Request, res: Response) => {
try {
const { host, port, protocol, username, password, city, state, country } = req.body;
if (!host || !port) {
return res.status(400).json({ error: 'Host and port are required' });
}
const { rows } = await pool.query(`
INSERT INTO proxies (host, port, protocol, username, password, city, state, country, active)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, true)
RETURNING *
`, [host, port, protocol || 'http', username, password, city, state, country]);
res.json({ proxy: rows[0] });
} catch (error: any) {
console.error('[ProxyAdmin] Error adding proxy:', error.message);
res.status(500).json({ error: error.message });
}
});
/**
* DELETE /api/admin/proxies/:id
* Remove a static proxy
*/
router.delete('/:id', async (req: Request, res: Response) => {
try {
const { id } = req.params;
await pool.query('DELETE FROM proxies WHERE id = $1', [id]);
res.json({ success: true });
} catch (error: any) {
console.error('[ProxyAdmin] Error deleting proxy:', error.message);
res.status(500).json({ error: error.message });
}
});
export default router;

View File

@@ -900,6 +900,168 @@ export interface PreflightResult {
responseTimeMs: number | null;
}
// ============================================================
// EVOMI GEO-TARGETING
// Dynamic proxy URL generation based on dispensary state
// ============================================================
/**
* State code to Evomi region ID mapping
* Format: lowercase, spaces replaced with dots
*/
const STATE_TO_REGION: Record<string, string> = {
'AL': 'alabama', 'AK': 'alaska', 'AZ': 'arizona', 'AR': 'arkansas',
'CA': 'california', 'CO': 'colorado', 'CT': 'connecticut', 'DE': 'delaware',
'FL': 'florida', 'GA': 'georgia', 'HI': 'hawaii', 'ID': 'idaho',
'IL': 'illinois', 'IN': 'indiana', 'IA': 'iowa', 'KS': 'kansas',
'KY': 'kentucky', 'LA': 'louisiana', 'ME': 'maine', 'MD': 'maryland',
'MA': 'massachusetts', 'MI': 'michigan', 'MN': 'minnesota', 'MS': 'mississippi',
'MO': 'missouri', 'MT': 'montana', 'NE': 'nebraska', 'NV': 'nevada',
'NH': 'new.hampshire', 'NJ': 'new.jersey', 'NM': 'new.mexico', 'NY': 'new.york',
'NC': 'north.carolina', 'ND': 'north.dakota', 'OH': 'ohio', 'OK': 'oklahoma',
'OR': 'oregon', 'PA': 'pennsylvania', 'RI': 'rhode.island', 'SC': 'south.carolina',
'SD': 'south.dakota', 'TN': 'tennessee', 'TX': 'texas', 'UT': 'utah',
'VT': 'vermont', 'VA': 'virginia', 'WA': 'washington', 'WV': 'west.virginia',
'WI': 'wisconsin', 'WY': 'wyoming',
// Canadian provinces
'AB': 'alberta', 'BC': 'british.columbia', 'MB': 'manitoba', 'NB': 'new.brunswick',
'NL': 'newfoundland.and.labrador', 'NS': 'nova.scotia', 'ON': 'ontario',
'PE': 'prince.edward.island', 'QC': 'quebec', 'SK': 'saskatchewan',
};
/**
* Evomi proxy configuration from environment
*/
export interface EvomiConfig {
enabled: boolean;
user: string;
pass: string;
host: string;
port: number;
sessionLifetimeMinutes: number;
}
/**
* Get Evomi configuration from environment variables
*/
export function getEvomiConfig(): EvomiConfig {
const user = process.env.EVOMI_USER || '';
const pass = process.env.EVOMI_PASS || '';
return {
enabled: !!(user && pass),
user,
pass,
host: process.env.EVOMI_HOST || 'rpc.evomi.com',
port: parseInt(process.env.EVOMI_PORT || '1000', 10),
sessionLifetimeMinutes: parseInt(process.env.EVOMI_SESSION_LIFETIME || '30', 10),
};
}
/**
* Build a geo-targeted Evomi proxy URL
*
* @param stateCode - US state code (e.g., 'AZ')
* @param workerId - Worker ID for session stickiness
* @param city - Optional city name (lowercase, dots for spaces)
* @returns Proxy URL or null if Evomi not configured
*/
export function buildEvomiProxyUrl(
stateCode: string,
workerId: string,
city?: string
): { url: string; geo: string; source: 'api' } | null {
const config = getEvomiConfig();
if (!config.enabled) {
return null;
}
const region = STATE_TO_REGION[stateCode.toUpperCase()];
if (!region) {
console.warn(`[Evomi] Unknown state code: ${stateCode}`);
return null;
}
// Generate session ID: workerId + region (sticky per worker per state)
const sessionId = `${workerId.replace(/[^a-zA-Z0-9]/g, '').slice(0, 6)}${region.slice(0, 4)}`;
// Build geo target string
let geoParams = `_country-US_region-${region}`;
let geoDisplay = region;
if (city) {
geoParams += `_city-${city}`;
geoDisplay = `${city}, ${region}`;
}
// Build full proxy URL
// Format: http://user:pass_geo_session@host:port
const url = `http://${config.user}:${config.pass}${geoParams}_session-${sessionId}_lifetime-${config.sessionLifetimeMinutes}@${config.host}:${config.port}`;
return {
url,
geo: geoDisplay,
source: 'api' as const,
};
}
/**
* Result from getProxyForTask
*/
export interface GeoProxyResult {
url: string;
ip: string | null; // Will be resolved after first request
geo: string;
source: 'api' | 'static';
}
/**
* Get a geo-targeted proxy for a specific task
* Prefers Evomi dynamic proxies, falls back to static table
*
* @param stateCode - Dispensary state code (e.g., 'AZ')
* @param workerId - Worker ID for session stickiness
* @param city - Optional city name
* @param rotator - CrawlRotator instance for fallback to static proxies
* @returns GeoProxyResult with URL and metadata
*/
export function getProxyForTask(
stateCode: string,
workerId: string,
city?: string,
rotator?: CrawlRotator
): GeoProxyResult | null {
// Try Evomi first
const evomiProxy = buildEvomiProxyUrl(stateCode, workerId, city);
if (evomiProxy) {
console.log(`[GeoProxy] Using Evomi for ${stateCode}: ${evomiProxy.geo}`);
return {
url: evomiProxy.url,
ip: null, // Will be resolved after connection
geo: evomiProxy.geo,
source: 'api',
};
}
// Fall back to static proxy table
if (rotator) {
const staticProxy = rotator.proxy.getCurrent();
if (staticProxy) {
console.log(`[GeoProxy] Falling back to static proxy: ${staticProxy.host}`);
return {
url: rotator.proxy.getProxyUrl(staticProxy),
ip: staticProxy.host,
geo: staticProxy.state || staticProxy.country || 'unknown',
source: 'static',
};
}
}
console.warn(`[GeoProxy] No proxy available for ${stateCode}`);
return null;
}
// ============================================================
// SINGLETON INSTANCES
// ============================================================

View File

@@ -244,15 +244,22 @@ class TaskScheduler {
* - Different crawl frequencies per state (e.g., AZ=4h, MI=6h)
* - Better rate limit management (one state at a time)
* - Easier debugging and monitoring per state
*
* Platform-aware: Each task gets platform from dispensary.menu_type
* (not from schedule.platform) so Jane/Dutchie stores route correctly.
*/
private async generateProductDiscoveryTasks(schedule: TaskSchedule): Promise<number> {
let dispensaryIds: number[] = [];
interface DispensaryRow {
id: number;
menu_type: string | null;
}
let dispensaries: DispensaryRow[] = [];
// Single-dispensary schedule (e.g., "Deeply Rooted Hourly")
if (schedule.dispensary_id) {
// Check if this specific store needs refresh
const result = await pool.query(`
SELECT d.id
SELECT d.id, d.menu_type
FROM dispensaries d
WHERE d.id = $1
AND d.crawl_enabled = true
@@ -266,9 +273,9 @@ class TaskScheduler {
)
`, [schedule.dispensary_id]);
dispensaryIds = result.rows.map((r: { id: number }) => r.id);
dispensaries = result.rows;
if (dispensaryIds.length === 0) {
if (dispensaries.length === 0) {
console.log(`[TaskScheduler] Store ${schedule.dispensary_id} has pending task or is disabled`);
return 0;
}
@@ -277,9 +284,9 @@ class TaskScheduler {
}
// Per-state schedule (e.g., "AZ Product Refresh")
else if (schedule.state_code) {
// Find stores in this state needing refresh
// Find stores in this state needing refresh (include menu_type for platform routing)
const result = await pool.query(`
SELECT d.id
SELECT d.id, d.menu_type
FROM dispensaries d
JOIN states s ON d.state_id = s.id
WHERE d.crawl_enabled = true
@@ -300,14 +307,14 @@ class TaskScheduler {
ORDER BY d.last_fetch_at NULLS FIRST, d.id
`, [schedule.state_code, schedule.interval_hours]);
dispensaryIds = result.rows.map((r: { id: number }) => r.id);
dispensaries = result.rows;
if (dispensaryIds.length === 0) {
if (dispensaries.length === 0) {
console.log(`[TaskScheduler] No stores in ${schedule.state_code} need refresh`);
return 0;
}
console.log(`[TaskScheduler] Creating ${dispensaryIds.length} product_discovery tasks for ${schedule.state_code}`);
console.log(`[TaskScheduler] Creating ${dispensaries.length} product_discovery tasks for ${schedule.state_code}`);
}
// No dispensary_id or state_code - invalid schedule
else {
@@ -315,21 +322,37 @@ class TaskScheduler {
return 0;
}
// Create product_discovery tasks with HTTP transport
// No stagger - worker controls pacing
// Group dispensaries by platform (menu_type)
const byPlatform: Record<string, number[]> = {};
for (const d of dispensaries) {
const platform = d.menu_type || 'dutchie';
if (!byPlatform[platform]) {
byPlatform[platform] = [];
}
byPlatform[platform].push(d.id);
}
// Create tasks per platform so each task routes to the correct handler
let totalCreated = 0;
for (const [platform, ids] of Object.entries(byPlatform)) {
if (ids.length > 0) {
console.log(`[TaskScheduler] Creating ${ids.length} product_discovery tasks for platform=${platform}`);
const { created } = await taskService.createStaggeredTasks(
dispensaryIds,
ids,
'product_discovery',
0, // No stagger - worker controls pacing
schedule.platform || 'dutchie',
platform,
'http', // Force HTTP transport
{
source: 'schedule',
source_schedule_id: schedule.id,
}
);
totalCreated += created;
}
}
return created;
return totalCreated;
}
/**

View File

@@ -1,4 +1,12 @@
/**
* @deprecated DO NOT USE - This file is deprecated.
* Use product-discovery-dutchie.ts instead (HTTP/Puppeteer transport).
*
* This was the old curl-based payload fetch handler.
* Kept for historical reference only.
*
* ---
* Original docs:
* Payload Fetch Handler
*
* Per TASK_WORKFLOW_2024-12-10.md: Separates API fetch from data processing.

View File

@@ -1,4 +1,12 @@
/**
* @deprecated DO NOT USE - This file is deprecated.
* Use product-discovery-dutchie.ts instead (HTTP/Puppeteer transport).
*
* This was the old curl-based product discovery handler.
* Kept for historical reference only.
*
* ---
* Original docs:
* Product Discovery Handler
*
* Per TASK_WORKFLOW_2024-12-10.md: Initial product fetch for newly discovered stores.
@@ -13,7 +21,7 @@
*/
import { TaskContext, TaskResult } from '../task-worker';
import { handlePayloadFetch } from './payload-fetch-curl';
import { handlePayloadFetch } from './_deprecated-payload-fetch-curl';
export async function handleProductDiscovery(ctx: TaskContext): Promise<TaskResult> {
const { task } = ctx;

View File

@@ -1,4 +1,12 @@
/**
* @deprecated DO NOT USE - This file is deprecated.
* Use store-discovery-dutchie.ts instead (HTTP/Puppeteer transport).
*
* This was the old curl-based store discovery handler.
* Kept for historical reference only.
*
* ---
* Original docs:
* Store Discovery Handler
*
* Per TASK_WORKFLOW_2024-12-10.md: Discovers new stores and returns their IDs for task chaining.

View File

@@ -0,0 +1,162 @@
/**
* Jane Entry Point Discovery Handler
*
* Resolves a dispensary's menu_url to a Jane store ID (platform_dispensary_id).
*
* Flow:
* 1. Load dispensary with menu_url
* 2. Navigate to menu URL and capture store API call
* 3. Extract store ID from API response
* 4. Update dispensary.platform_dispensary_id
* 5. Queue product_discovery task
*/
import { TaskContext, TaskResult } from '../task-worker';
import {
startSession,
endSession,
setCrawlRotator,
resolveStoreFromUrl,
} from '../../platforms/jane';
import { taskService } from '../task-service';
export async function handleEntryPointDiscoveryJane(ctx: TaskContext): Promise<TaskResult> {
const { pool, task, crawlRotator } = ctx;
const dispensaryId = task.dispensary_id;
if (!dispensaryId) {
return {
success: false,
error: 'Missing dispensary_id in task',
};
}
console.log(`[JaneEntryPoint] Starting for dispensary ${dispensaryId}`);
try {
// Load dispensary
const dispResult = await pool.query(
`SELECT id, name, menu_url, platform_dispensary_id, menu_type
FROM dispensaries WHERE id = $1`,
[dispensaryId]
);
if (dispResult.rows.length === 0) {
return {
success: false,
error: `Dispensary ${dispensaryId} not found`,
};
}
const dispensary = dispResult.rows[0];
// Skip if already resolved
if (dispensary.platform_dispensary_id) {
console.log(`[JaneEntryPoint] Already resolved: ${dispensary.platform_dispensary_id}`);
return {
success: true,
alreadyResolved: true,
storeId: dispensary.platform_dispensary_id,
};
}
if (!dispensary.menu_url) {
return {
success: false,
error: `Dispensary ${dispensaryId} has no menu_url`,
};
}
console.log(`[JaneEntryPoint] Resolving: ${dispensary.menu_url}`);
// Attach crawl rotator
if (crawlRotator) {
setCrawlRotator(crawlRotator);
}
// Resolve store from URL
const result = await resolveStoreFromUrl(dispensary.menu_url);
if (!result) {
// Update dispensary with resolution failure
await pool.query(
`UPDATE dispensaries
SET id_resolution_status = 'failed',
last_id_resolution_at = NOW(),
updated_at = NOW()
WHERE id = $1`,
[dispensaryId]
);
return {
success: false,
error: 'Could not resolve Jane store ID from URL',
};
}
const { storeId, store } = result;
console.log(`[JaneEntryPoint] Resolved store ID: ${storeId} (${store.name})`);
// Update dispensary with resolved store ID and store data
await pool.query(
`UPDATE dispensaries
SET platform_dispensary_id = $1,
menu_type = 'jane',
id_resolution_status = 'resolved',
last_id_resolution_at = NOW(),
stage = 'promoted',
latitude = COALESCE(latitude, $2),
longitude = COALESCE(longitude, $3),
phone = COALESCE(phone, $4),
is_medical = $5,
is_recreational = $6,
updated_at = NOW()
WHERE id = $7`,
[
storeId,
store.lat || null,
store.long || null,
store.phone || null,
store.medical,
store.recreational,
dispensaryId,
]
);
// Queue product_discovery task
console.log(`[JaneEntryPoint] Queuing product_discovery for dispensary ${dispensaryId}`);
await taskService.createTask({
role: 'product_discovery',
dispensary_id: dispensaryId,
platform: 'jane',
method: 'http',
priority: task.priority || 0,
});
return {
success: true,
storeId,
storeName: store.name,
productCount: store.product_count,
queuedProductDiscovery: true,
};
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
console.error(`[JaneEntryPoint] Error:`, errorMessage);
// Update dispensary with failure
await pool.query(
`UPDATE dispensaries
SET id_resolution_status = 'failed',
consecutive_failures = consecutive_failures + 1,
updated_at = NOW()
WHERE id = $1`,
[dispensaryId]
).catch(() => {});
return {
success: false,
error: errorMessage,
};
}
}

View File

@@ -3,18 +3,27 @@
*
* Exports all task handlers for the task worker.
*
* Product Discovery:
* - handleProductDiscoveryCurl: curl/axios based (for curl transport)
* - handleProductDiscoveryHttp: Puppeteer browser-based (for http transport)
* Naming convention: {task}-{platform}.ts
* - product-discovery-dutchie.ts
* - product-discovery-jane.ts
* - store-discovery-dutchie.ts
* - store-discovery-jane.ts
*
* Deprecated handlers (curl transport) are in _deprecated-*.ts files.
*/
export { handleProductDiscovery as handleProductDiscoveryCurl } from './product-discovery-curl';
export { handleProductDiscoveryHttp } from './product-discovery-http';
export { handlePayloadFetch as handlePayloadFetchCurl } from './payload-fetch-curl';
// Shared handlers (platform-agnostic)
export { handleProductRefresh } from './product-refresh';
export { handleStoreDiscovery } from './store-discovery';
export { handleStoreDiscoveryHttp } from './store-discovery-http';
export { handleStoreDiscoveryState } from './store-discovery-state';
export { handleEntryPointDiscovery } from './entry-point-discovery';
export { handleAnalyticsRefresh } from './analytics-refresh';
export { handleWhoami } from './whoami';
// Dutchie Platform Handlers
export { handleProductDiscoveryDutchie } from './product-discovery-dutchie';
export { handleStoreDiscoveryDutchie } from './store-discovery-dutchie';
// Jane Platform Handlers
export { handleStoreDiscoveryJane } from './store-discovery-jane';
export { handleEntryPointDiscoveryJane } from './entry-point-discovery-jane';
export { handleProductDiscoveryJane } from './product-discovery-jane';

View File

@@ -1,15 +1,17 @@
/**
* Product Discovery HTTP Handler (Browser-based)
* Product Discovery Handler - Dutchie Platform
*
* Uses Puppeteer + StealthPlugin to fetch products via browser context.
* Based on test-intercept.js pattern from ORGANIC_SCRAPING_GUIDE.md.
*
* Naming convention: {task}-{platform}.ts
*
* This handler:
* 1. Loads dispensary info
* 2. Launches headless browser with proxy (if provided)
* 3. Establishes session by visiting embedded menu
* 4. Fetches ALL products via GraphQL from browser context
* 5. Saves raw payload to filesystem (gzipped)
* 5. Saves raw payload to filesystem (gzipped) at payloads/dutchie/...
* 6. Records metadata in raw_crawl_payloads table
* 7. Queues product_refresh task to process the payload
*
@@ -26,7 +28,7 @@ import { taskService } from '../task-service';
// GraphQL hash for FilteredProducts query - MUST match CLAUDE.md
const FILTERED_PRODUCTS_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
export async function handleProductDiscoveryHttp(ctx: TaskContext): Promise<TaskResult> {
export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise<TaskResult> {
const { pool, task, crawlRotator, updateStep } = ctx;
const dispensaryId = task.dispensary_id;

View File

@@ -0,0 +1,168 @@
/**
* Jane Product Discovery Handler
*
* Fetches all products from a Jane store via Puppeteer + network interception.
*
* Flow:
* 1. Load dispensary with platform_dispensary_id
* 2. Navigate to menu URL and capture Algolia product responses
* 3. Save raw payload to filesystem
* 4. Queue product_refresh task for normalization
*/
import { TaskContext, TaskResult } from '../task-worker';
import {
startSession,
endSession,
setCrawlRotator,
fetchProductsFromUrl,
} from '../../platforms/jane';
import { saveRawPayload } from '../../utils/payload-storage';
import { taskService } from '../task-service';
export async function handleProductDiscoveryJane(ctx: TaskContext): Promise<TaskResult> {
const { pool, task, crawlRotator } = ctx;
const dispensaryId = task.dispensary_id;
if (!dispensaryId) {
return {
success: false,
error: 'Missing dispensary_id in task',
};
}
console.log(`[JaneProductDiscovery] Starting for dispensary ${dispensaryId}`);
try {
// Load dispensary
const dispResult = await pool.query(
`SELECT id, name, menu_url, platform_dispensary_id, menu_type
FROM dispensaries WHERE id = $1`,
[dispensaryId]
);
if (dispResult.rows.length === 0) {
return {
success: false,
error: `Dispensary ${dispensaryId} not found`,
};
}
const dispensary = dispResult.rows[0];
if (!dispensary.menu_url) {
return {
success: false,
error: `Dispensary ${dispensaryId} has no menu_url`,
};
}
console.log(`[JaneProductDiscovery] Fetching products from: ${dispensary.menu_url}`);
// Attach crawl rotator
if (crawlRotator) {
setCrawlRotator(crawlRotator);
}
// Fetch products
const result = await fetchProductsFromUrl(dispensary.menu_url);
if (result.products.length === 0) {
console.warn(`[JaneProductDiscovery] No products captured for dispensary ${dispensaryId}`);
// Update dispensary with failure
await pool.query(
`UPDATE dispensaries
SET consecutive_failures = consecutive_failures + 1,
updated_at = NOW()
WHERE id = $1`,
[dispensaryId]
);
return {
success: false,
error: 'No products captured from Jane menu page',
productCount: 0,
};
}
console.log(`[JaneProductDiscovery] Captured ${result.products.length} products`);
// Build payload for storage
// Store the raw Algolia hits for the normalizer
const rawPayload = {
hits: result.products.map(p => p.raw), // Use raw product data
store: result.store?.raw || null,
capturedAt: new Date().toISOString(),
platform: 'jane',
dispensaryId,
storeId: dispensary.platform_dispensary_id,
};
// Save raw payload to filesystem (platform = 'jane')
const { id: payloadId, sizeBytes } = await saveRawPayload(
pool,
dispensaryId,
rawPayload,
null, // crawl_run_id
result.products.length,
'jane' // platform
);
console.log(`[JaneProductDiscovery] Saved payload ${payloadId} (${Math.round(sizeBytes / 1024)}KB)`);
// Update dispensary stage and timestamps
await pool.query(
`UPDATE dispensaries
SET stage = 'hydrating',
last_fetch_at = NOW(),
consecutive_successes = consecutive_successes + 1,
consecutive_failures = 0,
updated_at = NOW()
WHERE id = $1`,
[dispensaryId]
);
// Queue product_refresh task for normalization
console.log(`[JaneProductDiscovery] Queuing product_refresh for payload ${payloadId}`);
await taskService.createTask({
role: 'product_refresh',
dispensary_id: dispensaryId,
platform: 'jane',
// method undefined = any worker can process (product_refresh is local)
priority: task.priority || 0,
payload: { payload_id: payloadId },
});
return {
success: true,
productCount: result.products.length,
payloadId,
payloadSizeKB: Math.round(sizeBytes / 1024),
storeInfo: result.store ? {
id: result.store.id,
name: result.store.name,
productCount: result.store.product_count,
} : null,
queuedProductRefresh: true,
};
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
console.error(`[JaneProductDiscovery] Error:`, errorMessage);
// Update dispensary with failure
await pool.query(
`UPDATE dispensaries
SET consecutive_failures = consecutive_failures + 1,
stage = CASE WHEN consecutive_failures >= 2 THEN 'failing' ELSE stage END,
updated_at = NOW()
WHERE id = $1`,
[dispensaryId]
).catch(() => {});
return {
success: false,
error: errorMessage,
};
}
}

View File

@@ -8,19 +8,23 @@
*
* Flow:
* 1. Load payload from filesystem (by payload_id or latest for dispensary)
* 2. Normalize data via DutchieNormalizer
* 3. Upsert to store_products and store_product_snapshots
* 4. Track missing products (increment consecutive_misses, mark OOS at 3)
* 5. Download new product images
* 2. Select normalizer based on dispensary.menu_type (dutchie or jane)
* 3. Normalize data via platform-specific normalizer
* 4. Upsert to store_products and store_product_snapshots
* 5. Track missing products (increment consecutive_misses, mark OOS at 3)
* 6. Download new product images
*
* Benefits of separation:
* - Retry-friendly: If this fails, re-run without re-crawling
* - Replay-able: Run against any historical payload
* - Faster: Local file read vs network call
* - Platform-agnostic: Same handler works for Dutchie and Jane
*/
import { TaskContext, TaskResult } from '../task-worker';
import { DutchieNormalizer } from '../../hydration/normalizers/dutchie';
import { JaneNormalizer } from '../../hydration/normalizers/jane';
import { BaseNormalizer } from '../../hydration/normalizers/base';
import {
upsertStoreProducts,
createStoreProductSnapshots,
@@ -29,7 +33,19 @@ import {
import { loadRawPayloadById, getLatestPayload } from '../../utils/payload-storage';
import { taskService } from '../task-service';
const normalizer = new DutchieNormalizer();
// Platform-aware normalizer registry
const NORMALIZERS: Record<string, BaseNormalizer> = {
dutchie: new DutchieNormalizer(),
jane: new JaneNormalizer(),
};
/**
* Get normalizer for a platform, defaults to Dutchie
*/
function getNormalizer(platform: string | null): BaseNormalizer {
const key = platform || 'dutchie';
return NORMALIZERS[key] || NORMALIZERS.dutchie;
}
export async function handleProductRefresh(ctx: TaskContext): Promise<TaskResult> {
const { pool, task, updateStep } = ctx;
@@ -90,7 +106,9 @@ export async function handleProductRefresh(ctx: TaskContext): Promise<TaskResult
const result = await getLatestPayload(pool, dispensaryId);
if (!result) {
// No payload exists - queue upstream task to fetch products
console.log(`[ProductRefresh] No payload found for dispensary ${dispensaryId} - queuing upstream task`);
// Use dispensary.menu_type for platform routing (jane vs dutchie)
const dispPlatform = dispensary.menu_type || 'dutchie';
console.log(`[ProductRefresh] No payload found for dispensary ${dispensaryId} - queuing upstream task (platform=${dispPlatform})`);
if (dispensary.platform_dispensary_id) {
// Has platform ID - can go straight to product_discovery
@@ -98,12 +116,14 @@ export async function handleProductRefresh(ctx: TaskContext): Promise<TaskResult
await taskService.createTask({
role: 'product_discovery',
dispensary_id: dispensaryId,
platform: dispPlatform,
priority: task.priority || 0,
method: 'http', // Use browser-based handler for session proxies
});
return {
success: true,
queued: 'product_discovery',
platform: dispPlatform,
reason: 'No payload exists - queued product_discovery to fetch initial data',
};
} else {
@@ -112,12 +132,14 @@ export async function handleProductRefresh(ctx: TaskContext): Promise<TaskResult
await taskService.createTask({
role: 'entry_point_discovery',
dispensary_id: dispensaryId,
platform: dispPlatform,
priority: task.priority || 0,
method: 'http', // Browser-only transport
});
return {
success: true,
queued: 'entry_point_discovery',
platform: dispPlatform,
reason: 'No payload and no platform_dispensary_id - queued entry_point_discovery to resolve ID',
};
}
@@ -167,6 +189,11 @@ export async function handleProductRefresh(ctx: TaskContext): Promise<TaskResult
created_at: new Date(),
};
// Select normalizer based on dispensary platform (menu_type)
const platform = dispensary.menu_type || 'dutchie';
const normalizer = getNormalizer(platform);
console.log(`[ProductRefresh] Using ${platform} normalizer for ${dispensary.name}`);
const normalizationResult = normalizer.normalize(rawPayload);
if (normalizationResult.errors.length > 0) {
@@ -179,10 +206,11 @@ export async function handleProductRefresh(ctx: TaskContext): Promise<TaskResult
error: 'Normalization produced no products',
payloadId,
productsProcessed: 0,
platform,
};
}
console.log(`[ProductRefresh] Normalized ${normalizationResult.products.length} products`);
console.log(`[ProductRefresh] Normalized ${normalizationResult.products.length} products (platform=${platform})`);
await ctx.heartbeat();

View File

@@ -1,8 +1,9 @@
/**
* Store Discovery HTTP Handler (Browser-based)
* Store Discovery Handler - Dutchie Platform
*
* Uses Puppeteer + StealthPlugin to discover stores via browser context.
* Based on product-discovery-http.ts pattern.
*
* Naming convention: {task}-{platform}.ts
*
* This handler:
* 1. Launches headless browser with proxy (if provided)
@@ -70,7 +71,7 @@ interface DiscoveredLocation {
};
}
export async function handleStoreDiscoveryHttp(ctx: TaskContext): Promise<TaskResult> {
export async function handleStoreDiscoveryDutchie(ctx: TaskContext): Promise<TaskResult> {
const { pool, task, crawlRotator, updateStep } = ctx;
const platform = task.platform || 'dutchie';

View File

@@ -0,0 +1,166 @@
/**
* Jane Store Discovery Handler
*
* Discovers iHeartJane stores by state and inserts them into the dispensaries table.
*
* Flow:
* 1. Navigate to Jane's store directory
* 2. Capture store data from API responses
* 3. Insert new stores into dispensaries table with menu_type='jane'
* 4. Return newStoreIds[] for chaining to product_discovery
*/
import { Pool } from 'pg';
import { TaskContext, TaskResult } from '../task-worker';
import {
startSession,
endSession,
setCrawlRotator,
discoverStoresByState,
DiscoveredStore,
} from '../../platforms/jane';
export async function handleStoreDiscoveryJane(ctx: TaskContext): Promise<TaskResult> {
const { pool, task, crawlRotator } = ctx;
// Get state from task payload
const stateCode = task.payload?.state as string;
if (!stateCode) {
return {
success: false,
error: 'Missing state in task payload',
newStoreIds: [],
};
}
console.log(`[JaneStoreDiscovery] Starting discovery for state: ${stateCode}`);
try {
// Attach crawl rotator if available
if (crawlRotator) {
setCrawlRotator(crawlRotator);
}
// Discover stores
const stores = await discoverStoresByState(stateCode);
if (stores.length === 0) {
console.log(`[JaneStoreDiscovery] No stores found in ${stateCode}`);
return {
success: true,
storesDiscovered: 0,
storesInserted: 0,
newStoreIds: [],
message: `No Jane stores found in ${stateCode}`,
};
}
console.log(`[JaneStoreDiscovery] Found ${stores.length} stores in ${stateCode}`);
// Insert stores into dispensaries table
const { inserted, newIds } = await insertJaneStores(pool, stores, stateCode);
console.log(`[JaneStoreDiscovery] Inserted ${inserted} new stores, ${newIds.length} IDs for chaining`);
return {
success: true,
storesDiscovered: stores.length,
storesInserted: inserted,
newStoreIds: newIds,
state: stateCode,
};
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
console.error(`[JaneStoreDiscovery] Error:`, errorMessage);
return {
success: false,
error: errorMessage,
newStoreIds: [],
};
}
}
/**
* Insert discovered Jane stores into dispensaries table
*/
async function insertJaneStores(
pool: Pool,
stores: DiscoveredStore[],
stateCode: string
): Promise<{ inserted: number; newIds: number[] }> {
const newIds: number[] = [];
let inserted = 0;
for (const store of stores) {
try {
// Build menu URL from store data
const menuUrl = `https://www.iheartjane.com/stores/${store.storeId}/${store.urlSlug || 'menu'}`;
// Upsert into dispensaries
// Use platform_dispensary_id as conflict key to avoid duplicates
const result = await pool.query(
`INSERT INTO dispensaries (
name,
address,
city,
state,
zip,
latitude,
longitude,
menu_url,
menu_type,
platform_dispensary_id,
is_medical,
is_recreational,
stage,
created_at,
updated_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, NOW(), NOW())
ON CONFLICT (menu_type, platform_dispensary_id)
WHERE menu_type = 'jane' AND platform_dispensary_id IS NOT NULL
DO UPDATE SET
name = EXCLUDED.name,
address = EXCLUDED.address,
city = EXCLUDED.city,
latitude = EXCLUDED.latitude,
longitude = EXCLUDED.longitude,
menu_url = EXCLUDED.menu_url,
is_medical = EXCLUDED.is_medical,
is_recreational = EXCLUDED.is_recreational,
updated_at = NOW()
RETURNING id, (xmax = 0) AS is_new`,
[
store.name,
store.address,
store.city,
stateCode,
store.zip,
store.lat,
store.long,
menuUrl,
'jane',
store.storeId,
store.medical,
store.recreational,
'discovered',
]
);
if (result.rows.length > 0) {
const { id, is_new } = result.rows[0];
if (is_new) {
newIds.push(id);
inserted++;
console.log(`[JaneStoreDiscovery] Inserted: ${store.name} (ID: ${id}, Jane ID: ${store.storeId})`);
} else {
console.log(`[JaneStoreDiscovery] Updated: ${store.name} (ID: ${id})`);
}
}
} catch (error: any) {
// Log but continue with other stores
console.error(`[JaneStoreDiscovery] Error inserting ${store.name}:`, error.message);
}
}
return { inserted, newIds };
}

View File

@@ -17,10 +17,15 @@ export {
export { TaskWorker, TaskContext, TaskResult } from './task-worker';
export {
handleProductDiscoveryCurl,
handleProductDiscoveryHttp,
// Shared handlers
handleProductRefresh,
handleStoreDiscovery,
handleEntryPointDiscovery,
handleAnalyticsRefresh,
// Dutchie handlers
handleProductDiscoveryDutchie,
handleStoreDiscoveryDutchie,
// Jane handlers
handleProductDiscoveryJane,
handleStoreDiscoveryJane,
handleEntryPointDiscoveryJane,
} from './handlers';

View File

@@ -68,19 +68,22 @@ import { runCurlPreflight, CurlPreflightResult } from '../services/curl-prefligh
import { runPuppeteerPreflightWithRetry, PuppeteerPreflightResult } from '../services/puppeteer-preflight';
// Task handlers by role
// Per TASK_WORKFLOW_2024-12-10.md: payload_fetch and product_refresh are now separate
// Dual-transport: curl vs http (browser-based) handlers
import { handlePayloadFetch } from './handlers/payload-fetch-curl';
// Platform-based handlers: {task}-{platform}.ts convention
import { handleProductRefresh } from './handlers/product-refresh';
import { handleProductDiscovery } from './handlers/product-discovery-curl';
import { handleProductDiscoveryHttp } from './handlers/product-discovery-http';
import { handleStoreDiscovery } from './handlers/store-discovery';
import { handleStoreDiscoveryHttp } from './handlers/store-discovery-http';
import { handleStoreDiscoveryState } from './handlers/store-discovery-state';
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
import { handleWhoami } from './handlers/whoami';
// Dutchie Platform Handlers
import { handleProductDiscoveryDutchie } from './handlers/product-discovery-dutchie';
import { handleStoreDiscoveryDutchie } from './handlers/store-discovery-dutchie';
// Jane Platform Handlers
import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane';
import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane';
import { handleProductDiscoveryJane } from './handlers/product-discovery-jane';
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
@@ -152,48 +155,66 @@ export interface TaskResult {
type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;
// Per TASK_WORKFLOW_2024-12-10.md: Handler registry
// payload_fetch: Fetches from Dutchie API, saves to disk
// product_refresh: Reads local payload, normalizes, upserts to DB
// product_discovery: Main handler for product crawling (has curl and http variants)
const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
payload_fetch: handlePayloadFetch, // API fetch -> disk (curl)
product_refresh: handleProductRefresh, // disk -> DB
product_discovery: handleProductDiscovery, // Default: curl (see getHandlerForTask for http override)
store_discovery: handleStoreDiscovery,
store_discovery_state: handleStoreDiscoveryState, // Per-state parallelized discovery
// Platform-agnostic handlers (shared across Dutchie and Jane)
// product_refresh: Reads local payload, uses platform-aware normalizer, upserts to DB
const SHARED_HANDLERS: Partial<Record<TaskRole, TaskHandler>> = {
product_refresh: handleProductRefresh,
store_discovery_state: handleStoreDiscoveryState,
entry_point_discovery: handleEntryPointDiscovery,
analytics_refresh: handleAnalyticsRefresh,
whoami: handleWhoami, // Tests proxy + anti-detect
whoami: handleWhoami,
};
/**
* Get the appropriate handler for a task, considering both role and method.
* Get the appropriate handler for a task based on platform.
*
* Dual-transport handlers:
* - product_discovery: curl (axios) or http (Puppeteer)
* - store_discovery: curl (axios) or http (Puppeteer)
* Naming convention: {task}-{platform}.ts
* - product-discovery-dutchie.ts
* - product-discovery-jane.ts
* - store-discovery-dutchie.ts
* - store-discovery-jane.ts
*
* Default method is 'http' since all GraphQL queries should use browser transport
* for better TLS fingerprinting and session-based proxy compatibility.
* All handlers use HTTP/Puppeteer transport (curl transport is deprecated).
*/
function getHandlerForTask(task: WorkerTask): TaskHandler | undefined {
const role = task.role as TaskRole;
const method = task.method || 'http'; // Default to HTTP for all GraphQL tasks
const platform = task.platform || 'dutchie';
// product_discovery: dual-transport support
if (role === 'product_discovery' && method === 'http') {
console.log(`[TaskWorker] Using HTTP handler for product_discovery (method=${method})`);
return handleProductDiscoveryHttp;
// ==========================================================================
// JANE PLATFORM ROUTING
// ==========================================================================
if (platform === 'jane') {
if (role === 'store_discovery' || role === 'store_discovery_state') {
console.log(`[TaskWorker] Using Jane handler for store_discovery`);
return handleStoreDiscoveryJane;
}
if (role === 'entry_point_discovery') {
console.log(`[TaskWorker] Using Jane handler for entry_point_discovery`);
return handleEntryPointDiscoveryJane;
}
if (role === 'product_discovery') {
console.log(`[TaskWorker] Using Jane handler for product_discovery`);
return handleProductDiscoveryJane;
}
}
// store_discovery: dual-transport support
if (role === 'store_discovery' && method === 'http') {
console.log(`[TaskWorker] Using HTTP handler for store_discovery (method=${method})`);
return handleStoreDiscoveryHttp;
// ==========================================================================
// DUTCHIE PLATFORM ROUTING (default)
// ==========================================================================
if (role === 'product_discovery') {
console.log(`[TaskWorker] Using Dutchie handler for product_discovery`);
return handleProductDiscoveryDutchie;
}
// Default: use the static handler registry (curl-based)
return TASK_HANDLERS[role];
if (role === 'store_discovery') {
console.log(`[TaskWorker] Using Dutchie handler for store_discovery`);
return handleStoreDiscoveryDutchie;
}
// ==========================================================================
// SHARED HANDLERS (platform-agnostic)
// ==========================================================================
return SHARED_HANDLERS[role];
}
/**

View File

@@ -1,17 +1,23 @@
/**
* Payload Storage Utility
*
* Per TASK_WORKFLOW_2024-12-10.md: Store raw GraphQL payloads for historical analysis.
* Per TASK_WORKFLOW_2024-12-10.md: Store raw API payloads for historical analysis.
*
* Design Pattern: Metadata/Payload Separation
* - Metadata in PostgreSQL (raw_crawl_payloads table): Small, indexed, queryable
* - Payload stored in MinIO/S3 (or local filesystem as fallback): Gzipped JSON
*
* Storage structure (MinIO):
* cannaiq/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
* cannaiq/payloads/{platform}/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
*
* Storage structure (Local fallback):
* ./storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
* ./storage/payloads/{platform}/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
*
* Platform values: 'dutchie', 'jane'
*
* Examples:
* payloads/dutchie/2024/12/13/store_456_1734105600000.json.gz
* payloads/jane/2024/12/13/store_2788_1734105600000.json.gz
*
* Benefits:
* - Compare any two crawls to see what changed
@@ -20,6 +26,7 @@
* - DB stays small, backups stay fast
* - ~90% compression (1.5MB -> 150KB per crawl)
* - Shared storage accessible by all worker pods (MinIO)
* - Platform separation for different retention/management policies
*/
import * as fs from 'fs';
@@ -98,23 +105,25 @@ export interface LoadPayloadResult {
/**
* Generate storage path/key for a payload
*
* MinIO format: payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
* Local format: ./storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
* MinIO format: payloads/{platform}/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
* Local format: ./storage/payloads/{platform}/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
*
* Platform defaults to 'dutchie' for backward compatibility
*/
function generateStoragePath(dispensaryId: number, timestamp: Date): string {
function generateStoragePath(dispensaryId: number, timestamp: Date, platform: string = 'dutchie'): string {
const year = timestamp.getFullYear();
const month = String(timestamp.getMonth() + 1).padStart(2, '0');
const day = String(timestamp.getDate()).padStart(2, '0');
const ts = timestamp.getTime();
const relativePath = `payloads/${year}/${month}/${day}/store_${dispensaryId}_${ts}.json.gz`;
const relativePath = `payloads/${platform}/${year}/${month}/${day}/store_${dispensaryId}_${ts}.json.gz`;
if (useMinIO) {
// MinIO uses forward slashes, no leading slash
return relativePath;
} else {
// Local filesystem uses OS-specific path
return path.join(PAYLOAD_BASE_PATH, String(year), month, day, `store_${dispensaryId}_${ts}.json.gz`);
return path.join(PAYLOAD_BASE_PATH, platform, String(year), month, day, `store_${dispensaryId}_${ts}.json.gz`);
}
}
@@ -138,9 +147,10 @@ function calculateChecksum(data: Buffer): string {
*
* @param pool - Database connection pool
* @param dispensaryId - ID of the dispensary
* @param payload - Raw JSON payload from GraphQL
* @param payload - Raw JSON payload from GraphQL/API
* @param crawlRunId - Optional crawl_run ID for linking
* @param productCount - Number of products in payload
* @param platform - Platform identifier ('dutchie' | 'jane'), defaults to 'dutchie'
* @returns SavePayloadResult with file info and DB record ID
*/
export async function saveRawPayload(
@@ -148,10 +158,11 @@ export async function saveRawPayload(
dispensaryId: number,
payload: any,
crawlRunId: number | null = null,
productCount: number = 0
productCount: number = 0,
platform: string = 'dutchie'
): Promise<SavePayloadResult> {
const timestamp = new Date();
const storagePath = generateStoragePath(dispensaryId, timestamp);
const storagePath = generateStoragePath(dispensaryId, timestamp, platform);
// Serialize and compress
const jsonStr = JSON.stringify(payload);

View File

@@ -47,6 +47,7 @@ import CrossStateCompare from './pages/CrossStateCompare';
import StateDetail from './pages/StateDetail';
import { Discovery } from './pages/Discovery';
import { WorkersDashboard } from './pages/WorkersDashboard';
import { ProxyManagement } from './pages/ProxyManagement';
import TasksDashboard from './pages/TasksDashboard';
import { ScraperOverviewDashboard } from './pages/ScraperOverviewDashboard';
import { SeoOrchestrator } from './pages/admin/seo/SeoOrchestrator';
@@ -124,6 +125,8 @@ export default function App() {
<Route path="/discovery" element={<PrivateRoute><Discovery /></PrivateRoute>} />
{/* Workers Dashboard */}
<Route path="/workers" element={<PrivateRoute><WorkersDashboard /></PrivateRoute>} />
{/* Proxy Management */}
<Route path="/proxies" element={<PrivateRoute><ProxyManagement /></PrivateRoute>} />
{/* Task Queue Dashboard */}
<Route path="/tasks" element={<PrivateRoute><TasksDashboard /></PrivateRoute>} />
{/* Scraper Overview Dashboard (new primary) */}

View File

@@ -0,0 +1,436 @@
import { useState, useEffect, useCallback } from 'react';
import { Layout } from '../components/Layout';
import { api } from '../lib/api';
import {
Globe,
RefreshCw,
Server,
MapPin,
Trash2,
Plus,
Shield,
ShieldCheck,
ShieldX,
Wifi,
WifiOff,
Clock,
Database,
Cloud,
Activity,
} from 'lucide-react';
interface Proxy {
id: number;
host: string;
port: number;
protocol: string;
username: string;
city: string | null;
state: string | null;
country: string | null;
country_code: string | null;
active: boolean;
failure_count: number;
consecutive_403_count: number;
response_time_ms: number | null;
last_tested_at: string | null;
}
interface ProxySession {
worker_id: string;
proxy_ip: string;
proxy_geo: string;
state: string;
started_at: string;
expires_at: string;
tasks_completed: number;
}
interface EvomiSettings {
enabled: boolean;
api_key_set: boolean;
available_regions: string[];
available_cities: string[];
}
interface TaskWithProxy {
id: number;
worker_id: string;
role: string;
status: string;
proxy_ip: string | null;
proxy_geo: string | null;
proxy_source: 'api' | 'static' | null;
dispensary_name: string;
dispensary_city: string;
dispensary_state: string;
created_at: string;
}
export function ProxyManagement() {
const [proxies, setProxies] = useState<Proxy[]>([]);
const [sessions, setSessions] = useState<ProxySession[]>([]);
const [tasks, setTasks] = useState<TaskWithProxy[]>([]);
const [evomiSettings, setEvomiSettings] = useState<EvomiSettings | null>(null);
const [loading, setLoading] = useState(true);
const [activeTab, setActiveTab] = useState<'sessions' | 'tasks' | 'static'>('sessions');
const [proxySource, setProxySource] = useState<'api' | 'static'>('api');
const fetchData = useCallback(async () => {
try {
setLoading(true);
// Fetch all proxy-related data
const [proxiesRes, sessionsRes, tasksRes, settingsRes] = await Promise.all([
api.get('/api/admin/proxies'),
api.get('/api/admin/proxies/sessions'),
api.get('/api/admin/proxies/tasks?limit=100'),
api.get('/api/admin/proxies/evomi-settings'),
]);
setProxies(proxiesRes.data.proxies || []);
setSessions(sessionsRes.data.sessions || []);
setTasks(tasksRes.data.tasks || []);
setEvomiSettings(settingsRes.data || null);
setProxySource(settingsRes.data?.enabled ? 'api' : 'static');
} catch (error) {
console.error('Error fetching proxy data:', error);
} finally {
setLoading(false);
}
}, []);
useEffect(() => {
fetchData();
const interval = setInterval(fetchData, 15000); // Refresh every 15s
return () => clearInterval(interval);
}, [fetchData]);
const getSourceBadge = (source: 'api' | 'static' | null) => {
if (source === 'api') {
return (
<span className="inline-flex items-center gap-1 px-2 py-0.5 rounded-full text-xs font-medium bg-blue-500/20 text-blue-400 border border-blue-500/30">
<Cloud className="w-3 h-3" />
Evomi API
</span>
);
} else if (source === 'static') {
return (
<span className="inline-flex items-center gap-1 px-2 py-0.5 rounded-full text-xs font-medium bg-gray-500/20 text-gray-400 border border-gray-500/30">
<Database className="w-3 h-3" />
Static
</span>
);
}
return (
<span className="inline-flex items-center gap-1 px-2 py-0.5 rounded-full text-xs font-medium bg-gray-700 text-gray-500">
-
</span>
);
};
return (
<Layout>
<div className="p-6">
{/* Header */}
<div className="flex items-center justify-between mb-6">
<div className="flex items-center gap-3">
<Globe className="w-8 h-8 text-emerald-400" />
<div>
<h1 className="text-2xl font-bold text-white">Proxy Management</h1>
<p className="text-gray-400 text-sm">Geo-targeted proxy sessions and usage tracking</p>
</div>
</div>
<div className="flex items-center gap-3">
{/* Source indicator */}
<div className="flex items-center gap-2 px-3 py-2 bg-gray-800 rounded-lg border border-gray-700">
<span className="text-gray-400 text-sm">Source:</span>
{proxySource === 'api' ? (
<span className="flex items-center gap-1.5 text-blue-400">
<Cloud className="w-4 h-4" />
Evomi API
</span>
) : (
<span className="flex items-center gap-1.5 text-gray-400">
<Database className="w-4 h-4" />
Static Table
</span>
)}
</div>
<button
onClick={fetchData}
className="flex items-center gap-2 px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg text-white transition-colors"
>
<RefreshCw className={`w-4 h-4 ${loading ? 'animate-spin' : ''}`} />
Refresh
</button>
</div>
</div>
{/* Evomi API Status Card */}
{evomiSettings && (
<div className="mb-6 p-4 bg-gray-800 rounded-lg border border-gray-700">
<div className="flex items-center justify-between">
<div className="flex items-center gap-3">
<div className={`p-2 rounded-lg ${evomiSettings.enabled ? 'bg-emerald-500/20' : 'bg-gray-700'}`}>
{evomiSettings.enabled ? (
<ShieldCheck className="w-5 h-5 text-emerald-400" />
) : (
<ShieldX className="w-5 h-5 text-gray-500" />
)}
</div>
<div>
<h3 className="text-white font-medium">Evomi Residential Proxies</h3>
<p className="text-gray-400 text-sm">
{evomiSettings.enabled
? `Dynamic geo-targeting enabled - ${evomiSettings.available_regions.length} regions available`
: 'API not configured - using static proxy table'}
</p>
</div>
</div>
<div className="flex items-center gap-4 text-sm">
<div className="text-gray-400">
<span className="text-white font-medium">{evomiSettings.available_regions.length}</span> US States
</div>
<div className="text-gray-400">
<span className="text-white font-medium">{evomiSettings.available_cities.length}</span> Cities
</div>
</div>
</div>
</div>
)}
{/* Tabs */}
<div className="flex gap-2 mb-6">
<button
onClick={() => setActiveTab('sessions')}
className={`px-4 py-2 rounded-lg font-medium transition-colors ${
activeTab === 'sessions'
? 'bg-emerald-500/20 text-emerald-400 border border-emerald-500/30'
: 'bg-gray-800 text-gray-400 hover:bg-gray-700 border border-gray-700'
}`}
>
<div className="flex items-center gap-2">
<Activity className="w-4 h-4" />
Active Sessions
</div>
</button>
<button
onClick={() => setActiveTab('tasks')}
className={`px-4 py-2 rounded-lg font-medium transition-colors ${
activeTab === 'tasks'
? 'bg-emerald-500/20 text-emerald-400 border border-emerald-500/30'
: 'bg-gray-800 text-gray-400 hover:bg-gray-700 border border-gray-700'
}`}
>
<div className="flex items-center gap-2">
<Server className="w-4 h-4" />
Task History
</div>
</button>
<button
onClick={() => setActiveTab('static')}
className={`px-4 py-2 rounded-lg font-medium transition-colors ${
activeTab === 'static'
? 'bg-emerald-500/20 text-emerald-400 border border-emerald-500/30'
: 'bg-gray-800 text-gray-400 hover:bg-gray-700 border border-gray-700'
}`}
>
<div className="flex items-center gap-2">
<Database className="w-4 h-4" />
Static Proxies ({proxies.length})
</div>
</button>
</div>
{/* Active Sessions Tab */}
{activeTab === 'sessions' && (
<div className="bg-gray-800 rounded-lg border border-gray-700 overflow-hidden">
<table className="w-full">
<thead className="bg-gray-900">
<tr>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Worker</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Proxy IP</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Geo Target</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">State</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Started</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Expires</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Tasks</th>
</tr>
</thead>
<tbody className="divide-y divide-gray-700">
{sessions.length === 0 ? (
<tr>
<td colSpan={7} className="px-4 py-8 text-center text-gray-500">
No active proxy sessions
</td>
</tr>
) : (
sessions.map((session, idx) => (
<tr key={idx} className="hover:bg-gray-750">
<td className="px-4 py-3 text-sm font-mono text-white">{session.worker_id}</td>
<td className="px-4 py-3 text-sm font-mono text-emerald-400">{session.proxy_ip}</td>
<td className="px-4 py-3 text-sm text-gray-300">{session.proxy_geo}</td>
<td className="px-4 py-3">
<span className="inline-flex items-center gap-1 px-2 py-0.5 rounded bg-blue-500/20 text-blue-400 text-xs font-medium">
<MapPin className="w-3 h-3" />
{session.state}
</span>
</td>
<td className="px-4 py-3 text-sm text-gray-400">
{new Date(session.started_at).toLocaleTimeString()}
</td>
<td className="px-4 py-3 text-sm text-gray-400">
{new Date(session.expires_at).toLocaleTimeString()}
</td>
<td className="px-4 py-3 text-sm text-white">{session.tasks_completed}</td>
</tr>
))
)}
</tbody>
</table>
</div>
)}
{/* Task History Tab */}
{activeTab === 'tasks' && (
<div className="bg-gray-800 rounded-lg border border-gray-700 overflow-hidden">
<table className="w-full">
<thead className="bg-gray-900">
<tr>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">ID</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Worker</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Proxy IP</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Geo</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Source</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Task</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Dispensary</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Location</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Status</th>
</tr>
</thead>
<tbody className="divide-y divide-gray-700">
{tasks.length === 0 ? (
<tr>
<td colSpan={9} className="px-4 py-8 text-center text-gray-500">
No task history with proxy data
</td>
</tr>
) : (
tasks.map((task) => (
<tr key={task.id} className="hover:bg-gray-750">
<td className="px-4 py-3 text-sm font-mono text-gray-400">{task.id}</td>
<td className="px-4 py-3 text-sm font-mono text-white">
{task.worker_id ? task.worker_id.substring(0, 20) : '-'}
</td>
<td className="px-4 py-3 text-sm font-mono text-emerald-400">
{task.proxy_ip || '-'}
</td>
<td className="px-4 py-3 text-sm text-gray-300">
{task.proxy_geo || '-'}
</td>
<td className="px-4 py-3">{getSourceBadge(task.proxy_source)}</td>
<td className="px-4 py-3 text-sm text-gray-300">{task.role}</td>
<td className="px-4 py-3 text-sm text-white max-w-[200px] truncate">
{task.dispensary_name}
</td>
<td className="px-4 py-3 text-sm text-gray-400">
{task.dispensary_city}, {task.dispensary_state}
</td>
<td className="px-4 py-3">
<span className={`inline-flex px-2 py-0.5 rounded text-xs font-medium ${
task.status === 'completed'
? 'bg-emerald-500/20 text-emerald-400'
: task.status === 'failed'
? 'bg-red-500/20 text-red-400'
: task.status === 'running'
? 'bg-blue-500/20 text-blue-400'
: 'bg-gray-500/20 text-gray-400'
}`}>
{task.status}
</span>
</td>
</tr>
))
)}
</tbody>
</table>
</div>
)}
{/* Static Proxies Tab */}
{activeTab === 'static' && (
<div className="bg-gray-800 rounded-lg border border-gray-700 overflow-hidden">
<div className="p-4 border-b border-gray-700 flex items-center justify-between">
<p className="text-gray-400 text-sm">
Static proxy list for fallback when Evomi API is unavailable
</p>
<button className="flex items-center gap-2 px-3 py-1.5 bg-emerald-600 hover:bg-emerald-500 rounded text-white text-sm transition-colors">
<Plus className="w-4 h-4" />
Add Proxy
</button>
</div>
<table className="w-full">
<thead className="bg-gray-900">
<tr>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Host:Port</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Location</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Status</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Response Time</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Failures</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">403s</th>
<th className="px-4 py-3 text-left text-xs font-medium text-gray-400 uppercase">Actions</th>
</tr>
</thead>
<tbody className="divide-y divide-gray-700">
{proxies.length === 0 ? (
<tr>
<td colSpan={7} className="px-4 py-8 text-center text-gray-500">
No static proxies configured. Using Evomi API for geo-targeted proxies.
</td>
</tr>
) : (
proxies.map((proxy) => (
<tr key={proxy.id} className="hover:bg-gray-750">
<td className="px-4 py-3 text-sm font-mono text-white">
{proxy.host}:{proxy.port}
</td>
<td className="px-4 py-3 text-sm text-gray-300">
{proxy.city && proxy.state
? `${proxy.city}, ${proxy.state}`
: proxy.country || '-'}
</td>
<td className="px-4 py-3">
{proxy.active ? (
<span className="inline-flex items-center gap-1 text-emerald-400 text-sm">
<Wifi className="w-3 h-3" />
Active
</span>
) : (
<span className="inline-flex items-center gap-1 text-gray-500 text-sm">
<WifiOff className="w-3 h-3" />
Inactive
</span>
)}
</td>
<td className="px-4 py-3 text-sm text-gray-400">
{proxy.response_time_ms ? `${proxy.response_time_ms}ms` : '-'}
</td>
<td className="px-4 py-3 text-sm text-gray-400">{proxy.failure_count}</td>
<td className="px-4 py-3 text-sm text-gray-400">{proxy.consecutive_403_count}</td>
<td className="px-4 py-3">
<button className="p-1.5 text-gray-400 hover:text-red-400 transition-colors">
<Trash2 className="w-4 h-4" />
</button>
</td>
</tr>
))
)}
</tbody>
</table>
</div>
)}
</div>
</Layout>
);
}

View File

@@ -350,33 +350,23 @@ function PreflightSummary({ worker }: { worker: Worker }) {
}
if (httpError) tooltipLines.push(`Error: ${httpError}`);
// Qualification styling - GOLD for qualified workers
// Qualification styling - compact with icon badge and geo
if (isQualified) {
return (
<div className="flex flex-col gap-1" title={tooltipLines.join('\n')}>
{/* Qualified badge - GOLD */}
<div className="inline-flex items-center gap-1.5 px-2 py-1 rounded-lg bg-gradient-to-r from-amber-100 to-yellow-100 border border-amber-300">
<ShieldCheck className="w-4 h-4 text-amber-600" />
<span className="text-xs font-bold text-amber-700">QUALIFIED</span>
{/* Qualified icon + IP on same line */}
<div className="inline-flex items-center gap-1.5">
<div className="p-1 rounded bg-gradient-to-r from-amber-100 to-yellow-100 border border-amber-300">
<ShieldCheck className="w-3.5 h-3.5 text-amber-600" />
</div>
{/* IP address */}
{httpIp && (
<div className="flex items-center gap-1 text-xs text-gray-600">
<Globe className="w-3 h-3 text-blue-500" />
<span className="font-mono">{httpIp}</span>
</div>
<span className="font-mono text-xs text-gray-600">{httpIp}</span>
)}
{/* Fingerprint summary */}
{fingerprint?.browser && (
</div>
{/* Antidetect status with response time */}
<div className="flex items-center gap-1 text-xs text-gray-500">
<Fingerprint className="w-3 h-3 text-purple-500" />
<span className="truncate max-w-[100px]">{fingerprint.browser}</span>
</div>
)}
{/* Antidetect status */}
<div className="flex items-center gap-1 text-xs">
<Shield className="w-3 h-3 text-emerald-500" />
<span className="text-emerald-600">Antidetect OK</span>
<span className="text-emerald-600">Antidetect</span>
<span className="text-emerald-500">OK</span>
{httpMs && <span className="text-gray-400">({httpMs}ms)</span>}
</div>
</div>