fix(preflight): Apply stored fingerprint to task browser
- Add WorkerFingerprint interface with timezone, city, state, ip, locale - Store fingerprint in TaskWorker after preflight passes - Pass fingerprint through TaskContext to handlers - Apply timezone via CDP and locale via Accept-Language header - Ensures browser fingerprint matches proxy IP location This fixes a bot-detection failure: when the browser's timezone/locale did not match the proxy IP's location, requests were being blocked by Cloudflare. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
227
backend/src/hydration/normalizers/treez.ts
Normal file
227
backend/src/hydration/normalizers/treez.ts
Normal file
@@ -0,0 +1,227 @@
|
||||
/**
|
||||
* Treez Platform Normalizer
|
||||
*
|
||||
* Normalizes raw Treez DOM-scraped product data to canonical format.
|
||||
*
|
||||
* Treez is scraped via Puppeteer (no API), so the raw format is
|
||||
* the TreezProductRaw interface from our client.
|
||||
*
|
||||
* Key differences from Dutchie/Jane:
|
||||
* - Data comes from DOM parsing, not API response
|
||||
* - Price is a single value (not multiple weights like Jane)
|
||||
* - Product ID is generated from product name or URL slug
|
||||
* - Less structured data (category/strain inferred from text)
|
||||
*/
|
||||
|
||||
import { BaseNormalizer } from './base';
|
||||
import {
|
||||
NormalizedProduct,
|
||||
NormalizedPricing,
|
||||
NormalizedAvailability,
|
||||
NormalizedBrand,
|
||||
NormalizedCategory,
|
||||
} from '../types';
|
||||
|
||||
/**
 * Normalizer for raw products scraped from a Treez online menu.
 *
 * Converts the DOM-scraped product records (see `TreezProductRaw` in the
 * Treez client) into the canonical product / pricing / availability / brand /
 * category shapes declared in `../types`.
 */
export class TreezNormalizer extends BaseNormalizer {
  readonly platform = 'treez';
  readonly supportedVersions = [1];

  /**
   * Lower-cased raw category -> canonical category name.
   *
   * Kept as a static `Map` so it is built once (not on every call) and so
   * that lookups cannot hit inherited `Object.prototype` members: with a plain
   * object literal, a scraped category such as "constructor" would resolve to
   * a function instead of a string.
   */
  private static readonly CATEGORY_MAP: ReadonlyMap<string, string> = new Map<string, string>([
    ['flower', 'Flower'],
    ['vape', 'Vape'],
    ['vapes', 'Vape'],
    ['cartridge', 'Vape'],
    ['edible', 'Edible'],
    ['edibles', 'Edible'],
    ['concentrate', 'Concentrate'],
    ['concentrates', 'Concentrate'],
    ['pre-roll', 'Pre-Roll'],
    ['preroll', 'Pre-Roll'],
    ['pre-rolls', 'Pre-Roll'],
    ['prerolls', 'Pre-Roll'],
    ['topical', 'Topical'],
    ['topicals', 'Topical'],
    ['tincture', 'Tincture'],
    ['tinctures', 'Tincture'],
    ['accessory', 'Accessory'],
    ['accessories', 'Accessory'],
    ['gear', 'Gear'],
  ]);

  // ============================================================
  // EXTRACTION
  // ============================================================

  /**
   * Pull the product array out of a raw payload.
   *
   * Accepted shapes, checked in this order: `{ products: [...] }`, a bare
   * array, or `{ hits: [...] }`.
   *
   * @param rawJson - Raw payload as stored by the crawler.
   * @returns The product array, or an empty array (with a warning logged)
   *          when none of the known shapes match.
   */
  extractProducts(rawJson: any): any[] {
    // Treez payload format: { products: [...] }
    if (rawJson?.products && Array.isArray(rawJson.products)) {
      return rawJson.products;
    }

    // Direct array of products
    if (Array.isArray(rawJson)) {
      return rawJson;
    }

    // Hits array (normalized format)
    if (rawJson?.hits && Array.isArray(rawJson.hits)) {
      return rawJson.hits;
    }

    console.warn('[TreezNormalizer] Could not extract products from payload');
    return [];
  }

  /**
   * Check that a payload is present and contains at least one product.
   *
   * @param rawJson - Raw payload to validate.
   * @returns `valid` plus the list of human-readable problems found.
   */
  validatePayload(rawJson: any): { valid: boolean; errors: string[] } {
    const errors: string[] = [];

    if (!rawJson) {
      errors.push('Payload is null or undefined');
      return { valid: false, errors };
    }

    const products = this.extractProducts(rawJson);
    if (products.length === 0) {
      errors.push('No products found in payload');
    }

    return { valid: errors.length === 0, errors };
  }

  // ============================================================
  // NORMALIZATION
  // ============================================================

  /**
   * Map one raw Treez product onto the canonical product shape.
   *
   * @param rawProduct - One element of the array returned by `extractProducts`.
   * @param dispensaryId - Internal dispensary the product belongs to.
   * @returns The normalized product, or `null` (with a warning logged) when
   *          the raw record has no `productId` or no `name`.
   */
  protected normalizeProduct(rawProduct: any, dispensaryId: number): NormalizedProduct | null {
    const externalId = rawProduct.productId;
    if (!externalId) {
      console.warn('[TreezNormalizer] Product missing ID, skipping');
      return null;
    }

    const name = rawProduct.name;
    if (!name) {
      console.warn(`[TreezNormalizer] Product ${externalId} missing name, skipping`);
      return null;
    }

    return {
      externalProductId: String(externalId),
      dispensaryId,
      platform: 'treez',
      platformDispensaryId: '', // Will be set by handler

      // Core fields
      name,
      brandName: rawProduct.brand || null,
      brandId: null, // Treez doesn't expose brand IDs
      category: this.normalizeCategory(rawProduct.category) || null,
      subcategory: rawProduct.subcategory || null,
      type: rawProduct.category || null,
      strainType: rawProduct.subcategory || null, // indica, sativa, hybrid

      // Potency - the scraper only yields a single number per cannabinoid,
      // so the "content" fields mirror the "percent" fields.
      thcPercent: rawProduct.thcPercent ?? null,
      cbdPercent: rawProduct.cbdPercent ?? null,
      thcContent: rawProduct.thcPercent ?? null,
      cbdContent: rawProduct.cbdPercent ?? null,

      // Status - scraped products are active
      status: 'Active',
      isActive: rawProduct.inStock !== false,
      medicalOnly: false,
      recOnly: false,

      // Images
      primaryImageUrl: rawProduct.imageUrl || null,
      images: rawProduct.imageUrl
        ? [{ url: rawProduct.imageUrl, position: 0 }]
        : [],

      // Raw reference
      rawProduct,
    };
  }

  /**
   * Build the pricing record for one raw product.
   *
   * The single scraped price is used for the recreational price and for both
   * ends of its range; all medical and special fields are `null`/`false`.
   *
   * @returns The pricing record, or `null` when the product has no `productId`.
   */
  protected normalizePricing(rawProduct: any): NormalizedPricing | null {
    const externalId = rawProduct.productId;
    if (!externalId) return null;

    // Convert once and reuse for rec / min / max.
    // NOTE(review): assumes the inherited toCents() is a pure conversion - confirm.
    const priceCents = this.toCents(rawProduct.price);

    return {
      externalProductId: String(externalId),

      // Treez typically shows a single price
      priceRec: priceCents,
      priceRecMin: priceCents,
      priceRecMax: priceCents,
      priceRecSpecial: null,

      // Treez doesn't distinguish med pricing in DOM
      priceMed: null,
      priceMedMin: null,
      priceMedMax: null,
      priceMedSpecial: null,

      isOnSpecial: false,
      specialName: null,
      discountPercent: null,
    };
  }

  /**
   * Build the availability record for one raw product.
   *
   * A product counts as in stock unless `inStock` is explicitly `false`.
   *
   * @returns The availability record, or `null` when the product has no `productId`.
   */
  protected normalizeAvailability(rawProduct: any): NormalizedAvailability | null {
    const externalId = rawProduct.productId;
    if (!externalId) return null;

    const inStock = rawProduct.inStock !== false;

    return {
      externalProductId: String(externalId),
      inStock,
      stockStatus: inStock ? 'in_stock' : 'out_of_stock',
      quantity: null, // Treez doesn't expose quantity in DOM
      quantityAvailable: null,
      isBelowThreshold: false,
      optionsBelowThreshold: false,
    };
  }

  /**
   * Derive a brand record from the raw product's `brand` text.
   *
   * @returns The brand, or `null` when the product carries no brand name.
   */
  protected extractBrand(rawProduct: any): NormalizedBrand | null {
    const brandName = rawProduct.brand;
    if (!brandName) return null;

    return {
      externalBrandId: null, // Treez doesn't expose brand IDs
      name: brandName,
      slug: this.slugify(brandName),
      logoUrl: null,
    };
  }

  /**
   * Derive a category record from the raw product's `category` text.
   *
   * @returns The category, or `null` when the product carries no category.
   */
  protected extractCategory(rawProduct: any): NormalizedCategory | null {
    const categoryName = rawProduct.category;
    if (!categoryName) return null;

    return {
      name: this.normalizeCategory(categoryName) || categoryName,
      // NOTE(review): the slug is built from the RAW category while the name is
      // canonicalized, so "vape" and "vapes" share a name but get different
      // slugs. Left unchanged because downstream keying is not visible here.
      slug: this.slugify(categoryName),
      parentCategory: null,
    };
  }

  // ============================================================
  // HELPERS
  // ============================================================

  /**
   * Normalize category name to standard format.
   *
   * @param category - Raw category text (any case, surrounding whitespace allowed).
   * @returns The canonical name when the lower-cased, trimmed text is a known
   *          category; otherwise the input unchanged; `null` for empty input.
   */
  private normalizeCategory(category: string | null | undefined): string | null {
    if (!category) return null;

    const lookupKey = category.toLowerCase().trim();
    return TreezNormalizer.CATEGORY_MAP.get(lookupKey) ?? category;
  }
}
|
||||
570
backend/src/platforms/treez/client.ts
Normal file
570
backend/src/platforms/treez/client.ts
Normal file
@@ -0,0 +1,570 @@
|
||||
/**
|
||||
* ============================================================
|
||||
* TREEZ PLATFORM CLIENT
|
||||
* ============================================================
|
||||
*
|
||||
* Treez is a fully client-side rendered platform (React/Next.js).
|
||||
* Unlike Dutchie (GraphQL) or Jane (Algolia), Treez requires DOM
|
||||
* parsing after page render. No API endpoints are available.
|
||||
*
|
||||
* Key differences:
|
||||
* - No Cloudflare protection (simpler than Jane)
|
||||
* - Products loaded via infinite scroll
|
||||
* - Data extracted from DOM elements
|
||||
* - Age gate must be bypassed
|
||||
*
|
||||
* URL Pattern: https://{storeId}.treez.io/onlinemenu/?customerType=ADULT
|
||||
* Store ID Format: String slug (e.g., "best")
|
||||
*
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import puppeteer, { Browser, Page } from 'puppeteer';
|
||||
import puppeteerExtra from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
|
||||
import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
|
||||
|
||||
// Register stealth plugin (good practice even without Cloudflare)
|
||||
puppeteerExtra.use(StealthPlugin());
|
||||
|
||||
// ============================================================
|
||||
// TYPES
|
||||
// ============================================================
|
||||
|
||||
/**
 * One product card as scraped from the rendered Treez menu DOM.
 */
export interface TreezProductRaw {
  /** Slug taken from the card's `/product/...` link, or a `treez_`-prefixed id derived from the name. */
  productId: string;
  /** Trimmed text of the card's name element. */
  name: string;
  /** Trimmed text of the card's brand element; empty string when none is found. */
  brand: string;
  /** Category inferred from the name / weight; empty string when nothing matched. */
  category: string;
  /** Strain type found in the card text: indica, sativa, hybrid (empty string when absent). */
  subcategory: string;
  /** Number parsed next to "THC" in the card text, if any. */
  thcPercent: number | null;
  /** Number parsed next to "CBD" in the card text, if any. */
  cbdPercent: number | null;
  /** Numeric value parsed from the `$`-prefixed price text, if any. */
  price: number | null;
  /** Weight string the price applies to; empty string when no weight was parsed. */
  priceUnit: string;
  /** Product image URL, if the card has an `<img>`. */
  imageUrl: string | null;
  /** False when the card text contains "out of stock" or "sold out". */
  inStock: boolean;
  /** Weight parsed from the product name (number + lower-cased unit), if any. */
  weight: string | null;
}
|
||||
|
||||
/**
 * State of the single active Puppeteer session used to scrape Treez.
 */
export interface TreezSession {
  /** Generated identifier of the form `treez_<timestamp>_<random>`. */
  sessionId: string;
  /** Browser instance launched for this session. */
  browser: Browser;
  /** The page (tab) all navigation and extraction runs in. */
  page: Page;
  /** Fingerprint whose viewport and user agent were applied to the page. */
  fingerprint: BrowserFingerprint;
  /** Proxy URL in use, or `null` when connecting directly. */
  proxyUrl: string | null;
  /** When the session was created. */
  startedAt: Date;
  /** Store the session was started for, if one was given. */
  storeId?: string;
}
|
||||
|
||||
/**
 * Basic identity of a Treez store as captured during a crawl.
 */
export interface TreezStoreInfo {
  /** Treez store slug (the subdomain of `treez.io`). */
  storeId: string;
  /** Display name, derived from the menu page title. */
  name: string;
  /** Menu URL the products were fetched from. */
  url: string;
}
|
||||
|
||||
// ============================================================
|
||||
// CONFIGURATION
|
||||
// ============================================================
|
||||
|
||||
/**
 * Tunables for the Treez scraper. All durations are in milliseconds.
 */
export const TREEZ_CONFIG = {
  /** Menu URL template; `{storeId}` is the store's subdomain slug. */
  baseUrl: 'https://{storeId}.treez.io/onlinemenu/',
  /** General operation timeout. */
  timeout: 60000,
  /** Timeout passed to `page.goto` when opening the menu. */
  navigationTimeout: 60000,
  /** Pause after each scroll step while loading more products. */
  scrollDelay: 1500,
  /** Upper bound on scroll steps per page. */
  maxScrollAttempts: 50,
  /** Pause after clicking the age-gate confirm button. */
  ageGateDelay: 2000,
};
|
||||
|
||||
// ============================================================
|
||||
// SESSION MANAGEMENT
|
||||
// ============================================================
|
||||
|
||||
let currentSession: TreezSession | null = null;
|
||||
let crawlRotator: CrawlRotator | null = null;
|
||||
|
||||
/**
 * Attach the CrawlRotator that supplies proxies and fingerprints to new
 * sessions, or detach the current one by passing `null`.
 *
 * @param rotator - Rotator to use from now on, or `null` to clear it.
 */
export function setCrawlRotator(rotator: CrawlRotator | null): void {
  crawlRotator = rotator;

  if (!rotator) {
    return;
  }

  console.log('[Treez Client] CrawlRotator attached');
}
|
||||
|
||||
/**
 * Return the CrawlRotator currently attached to this client.
 *
 * @returns The rotator, or `null` when none has been attached.
 */
export function getCrawlRotator(): CrawlRotator | null {
  const attached = crawlRotator;
  return attached;
}
|
||||
|
||||
/**
 * Start a new Treez browser session.
 *
 * Any session that is already active is closed first. The fingerprint and
 * proxy come from the attached CrawlRotator when there is one; otherwise a
 * fixed desktop Chrome fingerprint is used and no proxy is configured.
 *
 * The page is set up with the fingerprint's viewport and user agent, with
 * request interception that drops images, fonts, media and stylesheets, and
 * with proxy credentials when the proxy URL carries them.
 *
 * If anything fails after the browser has been launched, the browser is closed
 * before the error is rethrown, so a failed start does not leave an orphaned
 * browser process behind.
 *
 * @param storeId - Optional store slug recorded on the session.
 * @returns The newly created session (also stored as the current session).
 * @throws Whatever Puppeteer throws while launching or configuring the browser.
 */
export async function startSession(storeId?: string): Promise<TreezSession> {
  if (currentSession) {
    console.log('[Treez Client] Closing existing session before starting new one');
    await endSession();
  }

  // Get fingerprint from rotator or use defaults
  let fingerprint: BrowserFingerprint;
  let proxyUrl: string | null = null;

  if (crawlRotator) {
    fingerprint = crawlRotator.userAgent.getCurrent();
    const proxy = crawlRotator.proxy.getCurrent();
    if (proxy) {
      proxyUrl = crawlRotator.proxy.getProxyUrl(proxy);
    }
  } else {
    // Default fingerprint for local testing
    fingerprint = {
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      browserName: 'Chrome',
      deviceCategory: 'desktop',
      platform: 'Windows',
      screenWidth: 1920,
      screenHeight: 1080,
      viewportWidth: 1920,
      viewportHeight: 1080,
      acceptLanguage: 'en-US,en;q=0.9',
      secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
      secChUaPlatform: '"Windows"',
      secChUaMobile: '?0',
      httpFingerprint: {
        browserType: 'Chrome' as const,
        headers: {},
        headerOrder: [],
        curlImpersonateBinary: 'curl_chrome131',
        hasDNT: false,
      },
    };
  }

  // Build browser args
  const browserArgs = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled',
  ];

  if (proxyUrl) {
    // Chrome takes only host:port here; credentials are supplied separately
    // through page.authenticate() below.
    const proxyMatch = proxyUrl.match(/:\/\/([^@]+@)?([^/]+)/);
    if (proxyMatch) {
      browserArgs.push(`--proxy-server=${proxyMatch[2]}`);
    }
  }

  console.log('[Treez Client] Launching browser...');
  const browser = await puppeteerExtra.launch({
    headless: true,
    args: browserArgs,
  });

  let session: TreezSession;
  try {
    const page = await browser.newPage();

    // Set viewport
    await page.setViewport({
      width: fingerprint.viewportWidth || 1920,
      height: fingerprint.viewportHeight || 1080,
    });

    // Set user agent
    await page.setUserAgent(fingerprint.userAgent);

    // Block unnecessary resources to save bandwidth
    // We only need HTML/JS for DOM extraction - not images, fonts, etc.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const blocked = ['image', 'font', 'media', 'stylesheet'].includes(request.resourceType());
      const settled = blocked ? request.abort() : request.continue();
      // abort()/continue() reject if the request was already handled or the
      // page went away; nothing can be done about that from an event handler,
      // so swallow it rather than surface an unhandled rejection.
      settled.catch(() => undefined);
    });

    // Handle proxy authentication if needed
    if (proxyUrl) {
      const authMatch = proxyUrl.match(/:\/\/([^:]+):([^@]+)@/);
      if (authMatch) {
        await page.authenticate({
          username: authMatch[1],
          password: authMatch[2],
        });
      }
    }

    const sessionId = `treez_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;

    session = {
      sessionId,
      browser,
      page,
      fingerprint,
      proxyUrl,
      startedAt: new Date(),
      storeId,
    };
  } catch (err) {
    // The session was never registered, so endSession() cannot clean this
    // browser up later - close it here before propagating the failure.
    await browser.close().catch(() => undefined);
    throw err;
  }

  currentSession = session;

  console.log(`[Treez Client] Started session ${session.sessionId}`);
  console.log(`[Treez Client] Browser: ${fingerprint.browserName} (${fingerprint.deviceCategory})`);
  if (proxyUrl) {
    // Mask the password before logging.
    console.log(`[Treez Client] Proxy: ${proxyUrl.replace(/:[^:@]+@/, ':***@')}`);
  }

  return session;
}
|
||||
|
||||
/**
 * Close the active browser session, if there is one.
 *
 * Errors raised while closing the browser are logged and ignored; the
 * current session is cleared either way. Does nothing when no session exists.
 */
export async function endSession(): Promise<void> {
  if (!currentSession) {
    return;
  }

  const elapsedMs = Date.now() - currentSession.startedAt.getTime();
  const duration = Math.round(elapsedMs / 1000);
  console.log(`[Treez Client] Ending session ${currentSession.sessionId} (${duration}s)`);

  await currentSession.browser.close().catch((e) => {
    console.warn('[Treez Client] Error closing browser:', e);
  });

  currentSession = null;
}
|
||||
|
||||
/**
 * Return the session created by the most recent `startSession()` call.
 *
 * @returns The active session, or `null` when none is open.
 */
export function getCurrentSession(): TreezSession | null {
  const active = currentSession;
  return active;
}
|
||||
|
||||
// ============================================================
|
||||
// AGE GATE HANDLING
|
||||
// ============================================================
|
||||
|
||||
/**
 * Dismiss the age-verification modal when the page shows one.
 *
 * Looks for the modal, clicks its confirm button, pauses for
 * `TREEZ_CONFIG.ageGateDelay`, then waits up to 10 seconds for the modal to
 * leave the DOM (a timeout there is logged and tolerated).
 *
 * @param page - Page that has already navigated to a Treez menu.
 * @returns `true` when a gate was found and its confirm button clicked;
 *          `false` when there was no gate, no button, or an error occurred.
 */
export async function bypassAgeGate(page: Page): Promise<boolean> {
  console.log('[Treez Client] Checking for age gate...');

  try {
    const gateHandle = await page.$('[data-testid="age-gate-modal"], [class*="AgeGate"]');
    if (!gateHandle) {
      console.log('[Treez Client] No age gate detected');
      return false;
    }

    console.log('[Treez Client] Age gate detected, clicking confirm button...');

    const confirmHandle = await page.$('[data-testid="age-gate-submit-button"], button[type="submit"]');
    if (!confirmHandle) {
      console.log('[Treez Client] No submit button found');
      return false;
    }

    await confirmHandle.click();
    console.log('[Treez Client] Clicked confirm button');

    await sleep(TREEZ_CONFIG.ageGateDelay);

    // Wait for age gate to disappear
    try {
      await page.waitForFunction(
        () => !document.querySelector('[data-testid="age-gate-modal"]'),
        { timeout: 10000 }
      );
    } catch {
      console.log('[Treez Client] Gate may still be visible, continuing anyway');
    }

    console.log('[Treez Client] Age gate bypassed');
    return true;
  } catch (err: any) {
    console.log(`[Treez Client] Age gate error: ${err.message}`);
    return false;
  }
}
|
||||
|
||||
// ============================================================
|
||||
// NAVIGATION & SCRAPING
|
||||
// ============================================================
|
||||
|
||||
/**
 * Compose the online-menu URL for a Treez store.
 *
 * @param storeId - Store slug, used as the subdomain of `treez.io`.
 * @param customerType - Menu variant to request; defaults to `'ADULT'`.
 * @returns The full menu URL including the `customerType` query parameter.
 */
export function buildMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
  const origin = `https://${storeId}.treez.io`;
  const query = `customerType=${customerType}`;
  return `${origin}/onlinemenu/?${query}`;
}
|
||||
|
||||
/**
 * Open a store's menu in the current session and get past the age gate.
 *
 * Navigates with `networkidle2`, pauses 2 seconds for the client-side app to
 * render, attempts the age-gate bypass, then pauses another 2 seconds for
 * content to load.
 *
 * @param storeId - Store slug whose menu should be opened.
 * @throws Error when no session has been started.
 */
export async function navigateToMenu(storeId: string): Promise<void> {
  const session = currentSession;
  if (!session) {
    throw new Error('[Treez Client] No active session - call startSession() first');
  }

  const menuUrl = buildMenuUrl(storeId);
  console.log(`[Treez Client] Navigating to ${menuUrl}`);

  const navigationOptions = {
    waitUntil: 'networkidle2' as const,
    timeout: TREEZ_CONFIG.navigationTimeout,
  };
  await session.page.goto(menuUrl, navigationOptions);

  // Give the React app time to render before probing the DOM
  await sleep(2000);

  // Dismiss the age gate if one is shown
  await bypassAgeGate(session.page);

  // Let the menu content load
  await sleep(2000);

  console.log('[Treez Client] Menu page loaded');
}
|
||||
|
||||
/**
 * Scroll to the bottom of the page repeatedly so the infinite-scroll menu
 * loads every product.
 *
 * Stops after `TREEZ_CONFIG.maxScrollAttempts` scrolls, or as soon as the
 * page height has been unchanged for three consecutive checks. Every fifth
 * scroll the number of product cards currently in the DOM is logged.
 *
 * @param page - Page showing a Treez menu.
 * @returns The number of scroll steps performed.
 */
export async function scrollToLoadAll(page: Page): Promise<number> {
  const stableChecksToStop = 3;
  let lastHeight = 0;
  let unchangedStreak = 0;
  let scrollsDone = 0;

  console.log('[Treez Client] Starting infinite scroll...');

  for (; scrollsDone < TREEZ_CONFIG.maxScrollAttempts; ) {
    const heightNow = await page.evaluate(() => document.body.scrollHeight);

    // Track how many consecutive checks saw no growth in page height
    unchangedStreak = heightNow === lastHeight ? unchangedStreak + 1 : 0;
    if (unchangedStreak >= stableChecksToStop) {
      console.log('[Treez Client] No new content after 3 attempts, stopping');
      break;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(TREEZ_CONFIG.scrollDelay);

    lastHeight = heightNow;
    scrollsDone += 1;

    if (scrollsDone % 5 !== 0) {
      continue;
    }

    const productCount = await page.evaluate(() => {
      return document.querySelectorAll('[class*="product_product__"]').length;
    });
    console.log(`[Treez Client] Scroll ${scrollsDone}: ${productCount} products loaded`);
  }

  return scrollsDone;
}
|
||||
|
||||
/**
 * Extract products from the current page.
 *
 * Runs inside the browser: finds every product card, reads name, link, brand,
 * price and image from the card's elements, and parses THC/CBD, weight,
 * category, strain type and stock status out of the text. Cards without a
 * name, and cards whose name was already seen, are skipped.
 *
 * Heuristics:
 * - Product ID is the slug from the card's `/product/...` link; when absent it
 *   is `treez_` plus the first 30 characters of the lower-cased, underscored name.
 * - Weight is a number followed by a whole-word unit (mg, oz, g, gram, grams)
 *   in the name. The word boundary keeps names such as "10 Gummies" from being
 *   read as "10g".
 * - Category comes from keywords in the name only; failing that, a gram weight
 *   implies flower and a milligram weight implies edible.
 *
 * @param page - Page showing a fully loaded Treez menu.
 * @returns One record per distinct product name found on the page.
 */
export async function extractProducts(page: Page): Promise<TreezProductRaw[]> {
  console.log('[Treez Client] Extracting products from DOM...');

  // Everything inside this callback is serialized and executed in the page,
  // so it must be self-contained (no references to module scope).
  const { products, skipped } = await page.evaluate(() => {
    const results: any[] = [];
    let skippedCards = 0;

    // Loop-invariant lookup tables, built once per extraction.
    // Check explicit category patterns in NAME ONLY (not full text)
    // This avoids false positives from navigation elements
    const categoryPatterns = [
      { pattern: /vape|cart(?:ridge)?|pen|pod/i, category: 'vape' },
      { pattern: /edible|gummy|gummies|chocolate|candy/i, category: 'edible' },
      { pattern: /concentrate|dab|wax|shatter|rosin|resin/i, category: 'concentrate' },
      { pattern: /pre.?roll|joint|blunt/i, category: 'pre-roll' },
      { pattern: /topical|balm|cream|lotion/i, category: 'topical' },
      { pattern: /tincture/i, category: 'tincture' },
    ];
    const strainTypes = ['indica', 'sativa', 'hybrid'];
    // Unit must end at a word boundary ("3.5g", "1 Gram", "100mg"), so that
    // "10 Gummies" or "5 Grandaddy" are not mistaken for gram weights.
    const weightPattern = /(\d+(?:\.\d+)?)\s*(mg|oz|g)(?:rams?)?\b/i;

    // Find all product cards
    const productElements = Array.from(
      document.querySelectorAll('[class*="product_product__"]')
    ).filter(el => {
      const hasName = el.querySelector('[class*="product__name"]') || el.querySelector('[class*="name__"]');
      const hasPrice = el.querySelector('[class*="price"]');
      return hasName || hasPrice;
    });

    const seen = new Set<string>();

    for (const el of productElements) {
      try {
        // Get product name
        const nameEl = el.querySelector('[class*="product__name"], [class*="name__"]');
        const name = nameEl?.textContent?.trim() || '';

        if (!name || seen.has(name)) continue;
        seen.add(name);

        // Get product ID from link
        const linkEl = el.querySelector('a[href*="/product/"]');
        let productId = '';
        if (linkEl) {
          const href = linkEl.getAttribute('href') || '';
          const match = href.match(/\/product\/([^\/\?]+)/);
          productId = match ? match[1] : '';
        }
        if (!productId) {
          productId = `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`;
        }

        // Get brand
        const brandEl = el.querySelector('[class*="brand"], [class*="Brand"]');
        const brand = brandEl?.textContent?.trim() || '';

        // Get price
        const priceEl = el.querySelector('[class*="price__ins"], [class*="price"]');
        const priceText = priceEl?.textContent || '';
        const priceMatch = priceText.match(/\$(\d+(?:\.\d{2})?)/);
        const price = priceMatch ? parseFloat(priceMatch[1]) : null;

        // Get image URL
        const imgEl = el.querySelector('img');
        let imageUrl = imgEl?.getAttribute('src') || null;
        if (imageUrl && imageUrl.includes('/_next/image')) {
          // Next.js image proxy: the real source is carried in the `url` parameter
          const urlMatch = imageUrl.match(/url=([^&]+)/);
          if (urlMatch) {
            imageUrl = decodeURIComponent(urlMatch[1]);
          }
        }

        // Get text content for data extraction
        const text = el.textContent || '';
        const textLower = text.toLowerCase();

        // Get THC/CBD
        const thcMatch = text.match(/(?:THC[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*THC/i) ||
          text.match(/THC[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
        const cbdMatch = text.match(/(?:CBD[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*CBD/i) ||
          text.match(/CBD[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
        const thcPercent = thcMatch ? parseFloat(thcMatch[1]) : null;
        const cbdPercent = cbdMatch ? parseFloat(cbdMatch[1]) : null;

        // Get weight from name
        const weightMatch = name.match(weightPattern);
        const weight = weightMatch ? `${weightMatch[1]}${weightMatch[2].toLowerCase()}` : null;

        // Determine category from weight and name (not full text to avoid nav pollution)
        let category = '';
        for (const { pattern, category: cat } of categoryPatterns) {
          if (pattern.test(name)) {
            category = cat;
            break;
          }
        }

        // If no explicit category found, infer from weight
        if (!category && weight) {
          const weightLower = weight.toLowerCase();
          if (weightLower.includes('g') && !weightLower.includes('mg')) {
            // Gram weights (3.5g, 1g, 7g, etc.) are typically flower
            category = 'flower';
          } else if (weightLower.includes('mg')) {
            // Milligram weights are typically edibles
            category = 'edible';
          }
        }

        // Get strain type
        let subcategory = '';
        for (const strain of strainTypes) {
          if (textLower.includes(strain)) {
            subcategory = strain;
            break;
          }
        }

        // Check stock status
        const inStock = !textLower.includes('out of stock') && !textLower.includes('sold out');

        results.push({
          productId,
          name,
          brand,
          category,
          subcategory,
          thcPercent,
          cbdPercent,
          price,
          priceUnit: weight || '',
          imageUrl,
          inStock,
          weight,
        });
      } catch (err) {
        // Best effort: one malformed card must not abort the whole extraction,
        // but count it so the caller can report the loss.
        skippedCards++;
      }
    }

    return { products: results, skipped: skippedCards };
  });

  if (skipped > 0) {
    console.warn(`[Treez Client] Skipped ${skipped} product cards that failed extraction`);
  }
  console.log(`[Treez Client] Extracted ${products.length} products`);
  return products;
}
|
||||
|
||||
/**
 * Fetch every product from a store's menu using the current session.
 *
 * Main entry point for product discovery: navigates to the menu, captures the
 * store name from the page title, scrolls until all products are loaded and
 * extracts them. A success is reported to the CrawlRotator (when attached)
 * only if at least one product was extracted.
 *
 * @param storeId - Store slug to crawl.
 * @returns The extracted products, the store info and the number of scrolls.
 * @throws Error when no session has been started.
 */
export async function fetchAllProducts(storeId: string): Promise<{
  products: TreezProductRaw[];
  storeInfo: TreezStoreInfo;
  scrollCount: number;
}> {
  const session = currentSession;
  if (!session) {
    throw new Error('[Treez Client] No active session - call startSession() first');
  }
  const activePage = session.page;

  // Open the menu (handles the age gate as well)
  await navigateToMenu(storeId);

  // Store name: the text after the first "|" in the title, else the whole title
  const pageTitle = await activePage.title();
  const titleSuffix = pageTitle.split('|')[1]?.trim();
  const storeInfo: TreezStoreInfo = {
    storeId,
    name: titleSuffix || pageTitle,
    url: buildMenuUrl(storeId),
  };

  // Load everything, then read it out of the DOM
  const scrollCount = await scrollToLoadAll(activePage);
  const products = await extractProducts(activePage);

  // Record success if we got products
  const gotProducts = products.length > 0;
  if (crawlRotator && gotProducts) {
    await crawlRotator.recordSuccess();
  }

  return { products, storeInfo, scrollCount };
}
|
||||
|
||||
// ============================================================
|
||||
// UTILITY
|
||||
// ============================================================
|
||||
|
||||
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param ms - Delay in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
50
backend/src/platforms/treez/index.ts
Normal file
50
backend/src/platforms/treez/index.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
/**
|
||||
* Treez Platform Module
|
||||
*
|
||||
* Single export point for all Treez communication.
|
||||
* All Treez workers MUST import from this module.
|
||||
*/
|
||||
|
||||
export {
|
||||
// Session Management
|
||||
startSession,
|
||||
endSession,
|
||||
getCurrentSession,
|
||||
|
||||
// Proxy/Rotation
|
||||
setCrawlRotator,
|
||||
getCrawlRotator,
|
||||
|
||||
// Core Operations
|
||||
navigateToMenu,
|
||||
scrollToLoadAll,
|
||||
extractProducts,
|
||||
fetchAllProducts,
|
||||
bypassAgeGate,
|
||||
|
||||
// URL Building
|
||||
buildMenuUrl,
|
||||
|
||||
// Configuration
|
||||
TREEZ_CONFIG,
|
||||
|
||||
// Types
|
||||
type TreezSession,
|
||||
type TreezStoreInfo,
|
||||
type TreezProductRaw,
|
||||
} from './client';
|
||||
|
||||
// High-level Query Functions
|
||||
export {
|
||||
fetchProductsByStoreId,
|
||||
fetchProductsFromUrl,
|
||||
extractStoreIdFromUrl,
|
||||
validateStoreId,
|
||||
getMenuUrl,
|
||||
|
||||
// Types
|
||||
type FetchProductsResult,
|
||||
} from './queries';
|
||||
|
||||
// Re-export CrawlRotator types from canonical location
|
||||
export type { CrawlRotator, Proxy, ProxyStats } from '../../services/crawl-rotator';
|
||||
132
backend/src/platforms/treez/queries.ts
Normal file
132
backend/src/platforms/treez/queries.ts
Normal file
@@ -0,0 +1,132 @@
|
||||
/**
|
||||
* Treez High-Level Query Functions
|
||||
*
|
||||
* Wraps the low-level client methods with business logic
|
||||
* for common operations like product fetching.
|
||||
*/
|
||||
|
||||
import {
|
||||
startSession,
|
||||
endSession,
|
||||
fetchAllProducts,
|
||||
buildMenuUrl,
|
||||
TreezProductRaw,
|
||||
TreezStoreInfo,
|
||||
} from './client';
|
||||
|
||||
// ============================================================
|
||||
// PRODUCT OPERATIONS
|
||||
// ============================================================
|
||||
|
||||
/**
 * Outcome of a full product crawl of one Treez store.
 */
export interface FetchProductsResult {
  /** Identity of the store that was crawled. */
  store: TreezStoreInfo;
  /** Raw products extracted from the menu page. */
  products: TreezProductRaw[];
  /** Number of products captured (length of `products`). */
  totalCaptured: number;
  /** Number of scroll steps performed while loading the menu. */
  scrollCount: number;
}
|
||||
|
||||
/**
 * Crawl a Treez store end to end: open a session, fetch all products, and
 * always close the session again, even when the crawl fails.
 *
 * @param storeId - Treez store ID (slug like "best")
 * @returns Products and store data captured from the page
 */
export async function fetchProductsByStoreId(storeId: string): Promise<FetchProductsResult> {
  try {
    await startSession(storeId);

    const crawl = await fetchAllProducts(storeId);

    const result: FetchProductsResult = {
      store: crawl.storeInfo,
      products: crawl.products,
      totalCaptured: crawl.products.length,
      scrollCount: crawl.scrollCount,
    };
    return result;
  } finally {
    await endSession();
  }
}
|
||||
|
||||
/**
 * Crawl the Treez store that a menu URL points at.
 *
 * The store ID is taken from the URL; the crawl itself is delegated to
 * `fetchProductsByStoreId`.
 *
 * @param menuUrl - Full Treez menu URL
 * @returns Products and store data
 * @throws Error when no store ID can be extracted from `menuUrl`.
 */
export async function fetchProductsFromUrl(menuUrl: string): Promise<FetchProductsResult> {
  const storeId = extractStoreIdFromUrl(menuUrl);

  if (storeId) {
    return fetchProductsByStoreId(storeId);
  }

  throw new Error(`Could not extract store ID from URL: ${menuUrl}`);
}
|
||||
|
||||
// ============================================================
|
||||
// STORE OPERATIONS
|
||||
// ============================================================
|
||||
|
||||
/**
 * Extract store ID from a Treez URL
 *
 * Supported format:
 * - https://best.treez.io/onlinemenu/  ->  "best"
 *
 * Not supported (returns null):
 * - Custom domains such as https://shop.bestdispensary.com/ - resolving those
 *   would require following a redirect, which the caller must handle.
 *
 * The match is anchored to the start of the URL and the host must be exactly
 * `<storeId>.treez.io`, where `<storeId>` is a single DNS label. This rejects
 * look-alike hosts (`best.treez.io.evil.com`), Treez URLs that merely appear
 * inside another URL's path or query string, and URLs carrying userinfo.
 *
 * @param url - Treez menu URL
 * @returns Store ID (case preserved as written in the URL) or null if not found
 */
export function extractStoreIdFromUrl(url: string): string | null {
  // Pattern 1: {storeId}.treez.io
  //   ^https?://            scheme, at the very start
  //   ([a-z0-9]...)         one DNS label: letters, digits, inner hyphens
  //   \.treez\.io           literal host suffix
  //   (?=[:/?#]|$)          host must END here (port, path, query, fragment or end)
  const treezMatch = url
    .trim()
    .match(/^https?:\/\/([a-z0-9](?:[a-z0-9-]*[a-z0-9])?)\.treez\.io(?=[:/?#]|$)/i);
  if (treezMatch) {
    return treezMatch[1];
  }

  // Pattern 2: Custom domain - would need to follow redirect
  // For now, return null and let the caller handle it
  return null;
}
|
||||
|
||||
/**
 * Validate that a store ID exists and is accessible
 *
 * Opens a throwaway session, loads the store's menu URL and treats the store
 * as missing when the response status is 404 or the page title contains
 * "404" or "not found". Any error (navigation failure, timeout, launch
 * failure) also counts as not accessible. The session is always closed.
 *
 * @param storeId - Treez store ID
 * @returns True if store is accessible
 */
export async function validateStoreId(storeId: string): Promise<boolean> {
  try {
    // startSession() returns the session it creates, so the page can be used
    // directly instead of being looked up again through the client module.
    const { page } = await startSession(storeId);

    const response = await page.goto(buildMenuUrl(storeId), {
      waitUntil: 'domcontentloaded',
      timeout: 30000,
    });

    // An explicit HTTP 404 is conclusive regardless of what the page renders.
    if (response?.status() === 404) {
      return false;
    }

    // Check if we got a valid page (not 404)
    const title = (await page.title()).toLowerCase();
    const is404 = title.includes('404') || title.includes('not found');

    return !is404;
  } catch {
    return false;
  } finally {
    await endSession();
  }
}
|
||||
|
||||
// ============================================================
|
||||
// UTILITY FUNCTIONS
|
||||
// ============================================================
|
||||
|
||||
/**
 * Build the direct Treez menu URL for a store.
 *
 * Public wrapper that forwards both arguments to `buildMenuUrl`.
 *
 * @param storeId - Treez store ID
 * @param customerType - Customer type forwarded to `buildMenuUrl`; defaults to `'ADULT'`
 * @returns The URL produced by `buildMenuUrl`
 */
export function getMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
  const menuUrl = buildMenuUrl(storeId, customerType);
  return menuUrl;
}
|
||||
@@ -27,3 +27,6 @@ export { handleStoreDiscoveryDutchie } from './store-discovery-dutchie';
|
||||
export { handleStoreDiscoveryJane } from './store-discovery-jane';
|
||||
export { handleEntryPointDiscoveryJane } from './entry-point-discovery-jane';
|
||||
export { handleProductDiscoveryJane } from './product-discovery-jane';
|
||||
|
||||
// Treez Platform Handlers
|
||||
export { handleProductDiscoveryTreez } from './product-discovery-treez';
|
||||
|
||||
@@ -126,6 +126,28 @@ export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise<T
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// STEP 2b: Apply stored fingerprint (timezone, locale)
|
||||
// CRITICAL: Must match the IP's geographic location
|
||||
// ============================================================
|
||||
if (ctx.fingerprint?.timezone) {
|
||||
try {
|
||||
const client = await page.target().createCDPSession();
|
||||
await client.send('Emulation.setTimezoneOverride', { timezoneId: ctx.fingerprint.timezone });
|
||||
console.log(`[ProductDiscoveryHTTP] Browser timezone set to: ${ctx.fingerprint.timezone}`);
|
||||
} catch (tzErr: any) {
|
||||
console.warn(`[ProductDiscoveryHTTP] Failed to set timezone: ${tzErr.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Set locale to match proxy region (en-US for US proxies)
|
||||
if (ctx.fingerprint?.locale) {
|
||||
await page.setExtraHTTPHeaders({
|
||||
'Accept-Language': `${ctx.fingerprint.locale},en;q=0.9`,
|
||||
});
|
||||
console.log(`[ProductDiscoveryHTTP] Accept-Language set to: ${ctx.fingerprint.locale}`);
|
||||
}
|
||||
|
||||
await ctx.heartbeat();
|
||||
|
||||
// ============================================================
|
||||
|
||||
172
backend/src/tasks/handlers/product-discovery-treez.ts
Normal file
172
backend/src/tasks/handlers/product-discovery-treez.ts
Normal file
@@ -0,0 +1,172 @@
|
||||
/**
|
||||
* Treez Product Discovery Handler
|
||||
*
|
||||
* Fetches all products from a Treez store via Puppeteer + DOM scraping.
|
||||
*
|
||||
* Flow:
|
||||
* 1. Load dispensary with platform_dispensary_id (store slug)
|
||||
* 2. Navigate to menu URL, bypass age gate
|
||||
* 3. Scroll to load all products (infinite scroll)
|
||||
* 4. Extract products from DOM
|
||||
* 5. Save raw payload to filesystem
|
||||
* 6. Queue product_refresh task for normalization
|
||||
*/
|
||||
|
||||
import { TaskContext, TaskResult } from '../task-worker';
|
||||
import {
|
||||
setCrawlRotator,
|
||||
fetchProductsByStoreId,
|
||||
} from '../../platforms/treez';
|
||||
import { saveRawPayload } from '../../utils/payload-storage';
|
||||
import { taskService } from '../task-service';
|
||||
|
||||
/**
 * Task handler: discover all products for one Treez dispensary.
 *
 * Steps:
 *  1. Load the dispensary row and read its `platform_dispensary_id` (Treez store ID).
 *  2. Fetch products via `fetchProductsByStoreId`.
 *  3. Persist the raw payload with `saveRawPayload`.
 *  4. Mark the dispensary as `hydrating` and reset its failure counter.
 *  5. Queue a `product_refresh` task that points at the saved payload.
 *
 * The handler never throws: every failure is reported through the returned
 * `TaskResult` (`success: false` plus an `error` message).
 *
 * @param ctx - Task context; `pool`, `task` and `crawlRotator` are used
 * @returns Result describing the captured payload, or the failure reason
 */
export async function handleProductDiscoveryTreez(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task, crawlRotator } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return {
      success: false,
      error: 'Missing dispensary_id in task',
    };
  }

  console.log(`[TreezProductDiscovery] Starting for dispensary ${dispensaryId}`);

  try {
    // Load dispensary
    const dispResult = await pool.query(
      `SELECT id, name, menu_url, platform_dispensary_id, menu_type, platform
       FROM dispensaries WHERE id = $1`,
      [dispensaryId]
    );

    if (dispResult.rows.length === 0) {
      return {
        success: false,
        error: `Dispensary ${dispensaryId} not found`,
      };
    }

    const dispensary = dispResult.rows[0];

    if (!dispensary.platform_dispensary_id) {
      return {
        success: false,
        error: `Dispensary ${dispensaryId} has no platform_dispensary_id (Treez store ID)`,
      };
    }

    const storeId = dispensary.platform_dispensary_id;
    console.log(`[TreezProductDiscovery] Fetching products for Treez store "${storeId}"`);

    // Attach crawl rotator
    if (crawlRotator) {
      setCrawlRotator(crawlRotator);
    }

    // Fetch products via DOM scraping
    const result = await fetchProductsByStoreId(storeId);

    if (result.products.length === 0) {
      console.warn(`[TreezProductDiscovery] No products captured for dispensary ${dispensaryId}`);

      // Update dispensary with failure.
      // NOTE(review): unlike the catch path below, this branch does not move the
      // dispensary to the 'failing' stage — confirm that is intentional.
      await pool.query(
        `UPDATE dispensaries
         SET consecutive_failures = consecutive_failures + 1,
             updated_at = NOW()
         WHERE id = $1`,
        [dispensaryId]
      );

      return {
        success: false,
        error: 'No products captured from Treez menu page',
        productCount: 0,
      };
    }

    console.log(`[TreezProductDiscovery] Captured ${result.products.length} products`);

    // Build payload for storage
    const rawPayload = {
      products: result.products, // Store the scraped product data
      store: {
        storeId: result.store.storeId,
        name: result.store.name,
        url: result.store.url,
      },
      capturedAt: new Date().toISOString(),
      platform: 'treez',
      dispensaryId,
      scrollCount: result.scrollCount,
    };

    // Save raw payload to filesystem (platform = 'treez')
    const { id: payloadId, sizeBytes } = await saveRawPayload(
      pool,
      dispensaryId,
      rawPayload,
      null, // crawl_run_id
      result.products.length,
      'treez' // platform
    );

    console.log(`[TreezProductDiscovery] Saved payload ${payloadId} (${Math.round(sizeBytes / 1024)}KB)`);

    // Update dispensary stage and timestamps
    await pool.query(
      `UPDATE dispensaries
       SET stage = 'hydrating',
           last_fetch_at = NOW(),
           product_count = $2,
           consecutive_successes = consecutive_successes + 1,
           consecutive_failures = 0,
           updated_at = NOW()
       WHERE id = $1`,
      [dispensaryId, result.products.length]
    );

    // Queue product_refresh task for normalization
    console.log(`[TreezProductDiscovery] Queuing product_refresh for payload ${payloadId}`);
    await taskService.createTask({
      role: 'product_refresh',
      dispensary_id: dispensaryId,
      platform: 'treez',
      // Inherit the discovery task's priority; fall back to 0 only when unset.
      priority: task.priority ?? 0,
      payload: { payload_id: payloadId },
    });

    return {
      success: true,
      productCount: result.products.length,
      payloadId,
      payloadSizeKB: Math.round(sizeBytes / 1024),
      storeInfo: {
        storeId: result.store.storeId,
        name: result.store.name,
      },
      scrollCount: result.scrollCount,
      queuedProductRefresh: true,
    };
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[TreezProductDiscovery] Error:`, errorMessage);

    // Update dispensary with failure. This bookkeeping is best-effort: a failure
    // here must not mask the original error, but it is logged rather than
    // silently dropped so database problems remain visible.
    await pool
      .query(
        `UPDATE dispensaries
         SET consecutive_failures = consecutive_failures + 1,
             stage = CASE WHEN consecutive_failures >= 2 THEN 'failing' ELSE stage END,
             updated_at = NOW()
         WHERE id = $1`,
        [dispensaryId]
      )
      .catch((updateError: unknown) => {
        const updateMessage = updateError instanceof Error ? updateError.message : 'Unknown error';
        console.warn(
          `[TreezProductDiscovery] Failed to record failure for dispensary ${dispensaryId}: ${updateMessage}`
        );
      });

    return {
      success: false,
      error: errorMessage,
    };
  }
}
|
||||
@@ -24,6 +24,7 @@
|
||||
import { TaskContext, TaskResult } from '../task-worker';
|
||||
import { DutchieNormalizer } from '../../hydration/normalizers/dutchie';
|
||||
import { JaneNormalizer } from '../../hydration/normalizers/jane';
|
||||
import { TreezNormalizer } from '../../hydration/normalizers/treez';
|
||||
import { BaseNormalizer } from '../../hydration/normalizers/base';
|
||||
import {
|
||||
upsertStoreProducts,
|
||||
@@ -37,6 +38,7 @@ import { taskService } from '../task-service';
|
||||
const NORMALIZERS: Record<string, BaseNormalizer> = {
|
||||
dutchie: new DutchieNormalizer(),
|
||||
jane: new JaneNormalizer(),
|
||||
treez: new TreezNormalizer(),
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -87,6 +87,9 @@ import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane';
|
||||
import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane';
|
||||
import { handleProductDiscoveryJane } from './handlers/product-discovery-jane';
|
||||
|
||||
// Treez Platform Handlers
|
||||
import { handleProductDiscoveryTreez } from './handlers/product-discovery-treez';
|
||||
|
||||
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
|
||||
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
|
||||
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
|
||||
@@ -136,6 +139,14 @@ const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0
|
||||
// How long to wait (ms) when in backoff state before rechecking resources
|
||||
const BACKOFF_DURATION_MS = parseInt(process.env.BACKOFF_DURATION_MS || '10000');
|
||||
|
||||
/**
 * Geo/locale profile a worker records after its HTTP preflight passes.
 *
 * It is handed to task handlers through `TaskContext.fingerprint` so the
 * browser can be configured to match the proxy IP's location. Every field is
 * optional because the preflight result may not include it.
 */
export interface WorkerFingerprint {
  /** Proxy IP reported by the HTTP preflight check. */
  ip?: string;
  /** City from the preflight's detected location. */
  city?: string;
  /** Region from the preflight's detected location (its `region` value). */
  state?: string;
  /** Timezone detected during preflight; handlers pass it as the CDP `timezoneId` override. */
  timezone?: string;
  /** Locale tag (e.g. `en-US`) that handlers place first in the `Accept-Language` header. */
  locale?: string;
}
|
||||
|
||||
export interface TaskContext {
|
||||
pool: Pool;
|
||||
workerId: string;
|
||||
@@ -144,6 +155,8 @@ export interface TaskContext {
|
||||
crawlRotator?: CrawlRotator;
|
||||
/** Update the current step being executed (shown in dashboard) */
|
||||
updateStep: (step: string, detail?: string) => void;
|
||||
/** Worker's stored fingerprint from preflight (timezone, locale, etc.) */
|
||||
fingerprint?: WorkerFingerprint;
|
||||
}
|
||||
|
||||
export interface TaskResult {
|
||||
@@ -201,6 +214,17 @@ function getHandlerForTask(task: WorkerTask): TaskHandler | undefined {
|
||||
}
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// TREEZ PLATFORM ROUTING
|
||||
// ==========================================================================
|
||||
if (platform === 'treez') {
|
||||
if (role === 'product_discovery') {
|
||||
console.log(`[TaskWorker] Using Treez handler for product_discovery`);
|
||||
return handleProductDiscoveryTreez;
|
||||
}
|
||||
// Treez uses shared product_refresh handler via normalizer registry
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// DUTCHIE PLATFORM ROUTING (default)
|
||||
// ==========================================================================
|
||||
@@ -330,6 +354,8 @@ export class TaskWorker {
|
||||
private geoCity: string | null = null;
|
||||
private geoProxyUrl: string | null = null;
|
||||
private geoSessionStartedAt: Date | null = null;
|
||||
private storedTimezone: string | null = null;
|
||||
private storedFingerprint: WorkerFingerprint | null = null;
|
||||
|
||||
constructor(role: TaskRole | null = null, workerId?: string) {
|
||||
this.pool = getPool();
|
||||
@@ -655,7 +681,22 @@ export class TaskWorker {
|
||||
|
||||
console.log(`[TaskWorker] Preflight status reported to worker_registry`);
|
||||
if (this.preflightHttpResult?.proxyIp) {
|
||||
console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${(this.preflightHttpResult as any).detectedTimezone || 'unknown'}`);
|
||||
const detectedTimezone = (this.preflightHttpResult as any).detectedTimezone;
|
||||
const detectedLocation = (this.preflightHttpResult as any).detectedLocation;
|
||||
console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${detectedTimezone || 'unknown'}`);
|
||||
|
||||
// Store fingerprint for task execution - CRITICAL for anti-detect consistency
|
||||
if (this.preflightHttpPassed) {
|
||||
this.storedTimezone = detectedTimezone || null;
|
||||
this.storedFingerprint = {
|
||||
timezone: detectedTimezone,
|
||||
city: detectedLocation?.city,
|
||||
state: detectedLocation?.region,
|
||||
ip: this.preflightHttpResult.proxyIp,
|
||||
locale: 'en-US', // US proxies use English
|
||||
};
|
||||
console.log(`[TaskWorker] Stored fingerprint: ${JSON.stringify(this.storedFingerprint)}`);
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
// Non-fatal - worker can still function
|
||||
@@ -1349,7 +1390,7 @@ export class TaskWorker {
|
||||
throw new Error(`No handler registered for role: ${task.role}`);
|
||||
}
|
||||
|
||||
// Create context with step tracking
|
||||
// Create context with step tracking and fingerprint
|
||||
const ctx: TaskContext = {
|
||||
pool: this.pool,
|
||||
workerId: this.workerId,
|
||||
@@ -1361,6 +1402,8 @@ export class TaskWorker {
|
||||
updateStep: (step: string, detail?: string) => {
|
||||
this.updateTaskStep(task.id, step, detail);
|
||||
},
|
||||
// Pass stored fingerprint for browser configuration
|
||||
fingerprint: this.storedFingerprint || undefined,
|
||||
};
|
||||
|
||||
// Initialize step tracking for this task
|
||||
|
||||
Reference in New Issue
Block a user