fix(preflight): Apply stored fingerprint to task browser
- Add WorkerFingerprint interface with timezone, city, state, ip, locale
- Store fingerprint in TaskWorker after preflight passes
- Pass fingerprint through TaskContext to handlers
- Apply timezone via CDP and locale via Accept-Language header
- Ensure the browser fingerprint matches the proxy IP location

This fixes anti-detect detection, where a timezone/locale mismatch with the proxy IP was getting blocked by Cloudflare.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
130
backend/scripts/count-jane-stores-v2.ts
Normal file
130
backend/scripts/count-jane-stores-v2.ts
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
/**
|
||||||
|
* Count Jane stores - v2: Try Algolia store search
|
||||||
|
* Usage: npx ts-node scripts/count-jane-stores-v2.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
|
||||||
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
|
/**
 * Two-letter state codes associated with this store-counting script.
 *
 * NOTE(review): nothing in the visible part of this script reads the list;
 * confirm whether per-state exploration is still planned before relying on it.
 */
const STATES = [
  'AZ',
  'CA',
  'CO',
  'FL',
  'IL',
  'MA',
  'MI',
  'NV',
  'NJ',
  'NY',
  'OH',
  'PA',
  'WA',
  'OR',
];
|
||||||
|
|
||||||
|
/**
 * The subset of a Jane store record that this script reads.
 * NOTE(review): the real payload shape is not verified here — confirm against
 * an actual API response before relying on these field types.
 */
interface JaneStoreRecord {
  id?: number;
  name?: string;
  city?: string;
  state?: string;
}

/**
 * Loads the Jane store directory in headless Chrome, harvests store records
 * from every JSON response served from an iheartjane.com URL, and prints the
 * number of unique stores, a per-state breakdown (top 20) and the Arizona
 * entries.
 *
 * The browser is always closed, including when navigation or page evaluation
 * throws; such errors propagate to the caller.
 */
async function main() {
  console.log('Counting Jane stores by exploring state pages...\n');

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // Keyed by store id so a store seen in several responses is counted once.
  const allStores = new Map<number, JaneStoreRecord>();

  try {
    const page = await browser.newPage();

    await page.setRequestInterception(true);
    page.on('request', (req) => {
      // Heavy assets are irrelevant to the JSON traffic we want to observe.
      if (['image', 'font', 'media', 'stylesheet'].includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    page.on('response', async (response) => {
      const url = response.url();
      const contentType = response.headers()['content-type'] || '';
      if (!url.includes('iheartjane.com') || !contentType.includes('json')) {
        return;
      }

      try {
        const json = await response.json();
        // `stores` is looked for in any response; `hits` is the Algolia format.
        for (const list of [json?.stores, json?.hits]) {
          if (!Array.isArray(list)) continue;
          for (const s of list) {
            if (s?.id) allStores.set(s.id, s);
          }
        }
      } catch {
        // Best effort: bodies that are unavailable or not valid JSON are skipped.
      }
    });

    // First visit the main stores page
    console.log('Visiting main stores page...');
    await page.goto('https://www.iheartjane.com/stores', {
      waitUntil: 'networkidle0',
      timeout: 60000,
    });
    await new Promise((r) => setTimeout(r, 3000));

    // Try to scroll to load more stores
    console.log('Scrolling to load more...');
    for (let i = 0; i < 5; i++) {
      await page.evaluate(() => window.scrollBy(0, 1000));
      await new Promise((r) => setTimeout(r, 1000));
    }

    // Try clicking "Load More" if it exists. The lookup runs in the page with
    // plain DOM APIs: `:has-text()` is not a CSS selector, so passing it to
    // page.$() throws and the button would never be clicked.
    try {
      const clicked = await page.evaluate(() => {
        const candidates = Array.from(
          document.querySelectorAll<HTMLElement>('button, [class*="load-more"]'),
        );
        const target = candidates.find(
          (el) =>
            el.matches('[class*="load-more"]') ||
            (el.textContent || '').toLowerCase().includes('load more'),
        );
        if (!target) return false;
        target.click();
        return true;
      });
      if (clicked) {
        console.log('Clicking Load More...');
        await new Promise((r) => setTimeout(r, 3000));
      }
    } catch (err) {
      // Optional step: report the problem but keep whatever was captured so far.
      console.warn('Load More step failed:', err instanceof Error ? err.message : err);
    }

    // Extract stores from DOM as fallback
    const domStores = await page.evaluate(
      () =>
        document.querySelectorAll(
          '[data-store-id], [class*="StoreCard"], [class*="store-card"]',
        ).length,
    );

    console.log(`\nStores from DOM elements: ${domStores}`);
  } finally {
    await browser.close();
  }

  // Count by state
  const byState: Record<string, number> = {};
  for (const store of allStores.values()) {
    const state = store.state || 'Unknown';
    byState[state] = (byState[state] || 0) + 1;
  }

  console.log('\n=== JANE STORE COUNTS ===\n');
  console.log(`Unique stores captured: ${allStores.size}`);

  if (allStores.size > 0) {
    console.log('\nBy State:');
    const sorted = Object.entries(byState).sort((a, b) => b[1] - a[1]);
    for (const [state, count] of sorted.slice(0, 20)) {
      console.log(` ${state}: ${count}`);
    }

    // Check Arizona specifically
    const azStores = Array.from(allStores.values()).filter(
      (s) => s.state === 'Arizona' || s.state === 'AZ',
    );
    console.log(`\nArizona stores: ${azStores.length}`);
    if (azStores.length > 0) {
      console.log('AZ stores:');
      for (const s of azStores.slice(0, 10)) {
        console.log(` - ${s.name} (ID: ${s.id}) - ${s.city}`);
      }
    }
  }

  // Note about total
  console.log('\n--- Note ---');
  console.log('Jane uses server-side rendering. To get full store count,');
  console.log('you may need to check their public marketing materials or');
  console.log('iterate through known store IDs.');
}
|
||||||
|
|
||||||
|
main().catch(console.error);
|
||||||
98
backend/scripts/count-jane-stores.ts
Normal file
98
backend/scripts/count-jane-stores.ts
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
/**
|
||||||
|
* Count Jane stores by state
|
||||||
|
* Usage: npx ts-node scripts/count-jane-stores.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
|
||||||
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
|
/**
 * The subset of a Jane store record that this script reads.
 * NOTE(review): the real payload shape is not verified here — confirm against
 * an actual API response before relying on these field types.
 */
interface JaneStore {
  id?: number | string;
  name?: string;
  city?: string;
  state?: string;
}

/**
 * Loads the Jane store directory in headless Chrome, collects the `stores`
 * arrays from iheartjane.com responses whose URL contains "stores", and prints
 * the total, a per-state breakdown and a sample of Arizona stores. It also
 * reports the first "<n> stores" figure found in the page text, if any.
 *
 * Stores that carry an id are counted once even if they appear in several
 * responses. The browser is always closed, including on failure; errors
 * propagate to the caller.
 */
async function main() {
  console.log('Counting Jane stores...\n');

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // Capture store data from API
  const stores: JaneStore[] = [];
  const seenIds = new Set<number | string>();
  let pageStoreCount: number | null = null;

  try {
    const page = await browser.newPage();

    await page.setRequestInterception(true);
    page.on('request', (req) => {
      // Heavy assets are irrelevant to the API traffic we want to observe.
      if (['image', 'font', 'media', 'stylesheet'].includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    page.on('response', async (response) => {
      const url = response.url();
      if (!(url.includes('iheartjane.com') && url.includes('stores'))) {
        return;
      }

      try {
        const json = await response.json();
        if (!json || !Array.isArray(json.stores)) return;

        for (const store of json.stores) {
          // Entries that are not objects cannot be reported on; skip them.
          if (store === null || typeof store !== 'object') continue;

          const id = store.id;
          if (id !== undefined && id !== null) {
            // The same store can be delivered by more than one response.
            if (seenIds.has(id)) continue;
            seenIds.add(id);
          }
          stores.push(store);
        }
      } catch {
        // Best effort: bodies that are unavailable or not valid JSON are skipped.
      }
    });

    // Visit the store directory
    console.log('Loading Jane store directory...');
    await page.goto('https://www.iheartjane.com/stores', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for stores to load
    await new Promise((r) => setTimeout(r, 5000));

    // Also try to get store count from page content
    pageStoreCount = await page.evaluate(() => {
      // Look for store count in page text
      const match = document.body.innerText.match(/(\d+)\s*stores?/i);
      return match ? parseInt(match[1], 10) : null;
    });
  } finally {
    await browser.close();
  }

  // Count by state
  const byState: Record<string, number> = {};
  for (const store of stores) {
    const state = store.state || 'Unknown';
    byState[state] = (byState[state] || 0) + 1;
  }

  console.log('\n=== JANE STORE COUNTS ===\n');
  console.log(`Total stores captured from API: ${stores.length}`);
  // Compare against null so that a page that says "0 stores" is still reported.
  if (pageStoreCount !== null) {
    console.log(`Page claims: ${pageStoreCount} stores`);
  }

  console.log('\nBy State:');
  const sorted = Object.entries(byState).sort((a, b) => b[1] - a[1]);
  for (const [state, count] of sorted) {
    console.log(` ${state}: ${count}`);
  }

  // Check Arizona specifically
  const azStores = stores.filter((s) => s.state === 'Arizona' || s.state === 'AZ');
  console.log(`\nArizona stores: ${azStores.length}`);
  if (azStores.length > 0) {
    console.log('Sample AZ stores:');
    for (const s of azStores.slice(0, 5)) {
      console.log(` - ${s.name} (ID: ${s.id}) - ${s.city}`);
    }
  }
}
|
||||||
|
|
||||||
|
main().catch(console.error);
|
||||||
247
backend/scripts/explore-treez-structure.ts
Normal file
247
backend/scripts/explore-treez-structure.ts
Normal file
@@ -0,0 +1,247 @@
|
|||||||
|
/**
|
||||||
|
* Explore Treez site structure to find full product catalog
|
||||||
|
*
|
||||||
|
* Usage: npx ts-node scripts/explore-treez-structure.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer from 'puppeteer';
|
||||||
|
|
||||||
|
// Treez store identifier; main() interpolates it as the subdomain of the
// `https://${STORE_ID}.treez.io/...` URLs it visits.
const STORE_ID = 'best';
|
||||||
|
|
||||||
|
/**
 * Pauses the calling async flow.
 *
 * @param ms - Delay, in milliseconds, handed to `setTimeout`.
 * @returns A promise that settles once the timer has fired.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Explores the online menu of the Treez store named by STORE_ID and logs what
 * it finds:
 *   [1] loads the base menu URL and dismisses the age gate if one is shown,
 *   [2] lists navigation links and the product count on the landing page,
 *   [3] lists elements that look like category navigation,
 *   [4] clicks the first of a few well-known labels ("Flower", "All", ...),
 *   [5] scrolls to observe whether more products load,
 *   [6] probes a handful of candidate URL patterns,
 *   [7] saves a full-page screenshot to /tmp/treez-explore.png.
 *
 * Errors are logged rather than rethrown, and the browser is always closed.
 * A failure on one candidate URL in step [6] does not stop the other probes.
 */
async function main() {
  console.log('='.repeat(60));
  console.log('Exploring Treez Site Structure');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // Anything can be thrown, so narrow before reading `.message`.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Number of product cards currently in the DOM.
    const countProducts = (): Promise<number> =>
      page.evaluate(
        (selector) => document.querySelectorAll(selector).length,
        '[class*="product_product__"]',
      );

    // Clicks through the age-gate modal when it is present; no-op otherwise.
    const dismissAgeGate = async (announcement?: string): Promise<void> => {
      const gate = await page.$('[data-testid="age-gate-modal"]');
      if (!gate) return;
      if (announcement) console.log(announcement);
      const submit = await page.$('[data-testid="age-gate-submit-button"]');
      if (submit) await submit.click();
      await sleep(2000);
    };

    // Navigate to base menu URL
    const baseUrl = `https://${STORE_ID}.treez.io/onlinemenu/?customerType=ADULT`;
    console.log(`\n[1] Navigating to: ${baseUrl}`);
    await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);

    // Bypass age gate if present
    await dismissAgeGate('[1] Age gate detected, bypassing...');

    // Get all navigation links
    console.log('\n[2] Extracting navigation structure...');
    const navInfo = await page.evaluate(() => {
      const links: { text: string; href: string }[] = [];

      // Look for nav links (de-duplicated by href)
      document
        .querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a, header a')
        .forEach((el) => {
          const text = el.textContent?.trim() || '';
          const href = el.getAttribute('href') || '';
          if (text && href && !links.some((l) => l.href === href)) {
            links.push({ text, href });
          }
        });

      // Look for category tabs/buttons (de-duplicated by text)
      document
        .querySelectorAll('[class*="category"], [class*="tab"], [role="tab"]')
        .forEach((el) => {
          const text = el.textContent?.trim() || '';
          const href = el.getAttribute('href') || el.getAttribute('data-href') || '';
          if (text && !links.some((l) => l.text === text)) {
            links.push({ text, href: href || `(click: ${el.className})` });
          }
        });

      // Get current URL
      const currentUrl = window.location.href;

      // Count products on page
      const productCount = document.querySelectorAll('[class*="product_product__"]').length;

      return { links, currentUrl, productCount };
    });

    console.log(`Current URL: ${navInfo.currentUrl}`);
    console.log(`Products on homepage: ${navInfo.productCount}`);
    console.log('\nNavigation links found:');
    navInfo.links.forEach((l) => {
      console.log(` "${l.text}" → ${l.href}`);
    });

    // Look for category buttons/tabs specifically
    console.log('\n[3] Looking for category navigation...');
    const categories = await page.evaluate(() => {
      const cats: { text: string; className: string; tagName: string }[] = [];

      // Find all clickable elements that might be categories
      const selectors = [
        '[class*="CategoryNav"]',
        '[class*="category"]',
        '[class*="Category"]',
        '[class*="nav"] button',
        '[class*="tab"]',
        '[role="tablist"] *',
        '.MuiTab-root',
        '[class*="filter"]',
      ];

      selectors.forEach((sel) => {
        document.querySelectorAll(sel).forEach((el) => {
          const text = el.textContent?.trim() || '';
          // Long text is unlikely to be a category label; skip duplicates too.
          if (text && text.length < 50 && !cats.some((c) => c.text === text)) {
            cats.push({
              text,
              className: el.className?.toString().slice(0, 80) || '',
              tagName: el.tagName,
            });
          }
        });
      });

      return cats;
    });

    console.log('Category-like elements:');
    categories.forEach((c) => {
      console.log(` [${c.tagName}] "${c.text}" (class: ${c.className})`);
    });

    // Try clicking on "Flower" or "All" if found
    console.log('\n[4] Looking for "Flower" or "All Products" link...');
    const clickTargets = ['Flower', 'All', 'All Products', 'Shop All', 'View All'];

    for (const target of clickTargets) {
      // Locate and click in a single evaluate so the element that is reported
      // is exactly the element that gets clicked.
      const clicked = await page.evaluate((targetText) => {
        const wanted = targetText.toLowerCase();
        const match = Array.from(
          document.querySelectorAll('a, button, [role="tab"], [class*="category"]'),
        ).find((el) => el.textContent?.trim().toLowerCase() === wanted);
        if (!match) return null;

        const info = { text: match.textContent?.trim(), tag: match.tagName };
        (match as HTMLElement).click();
        return info;
      }, target);

      if (!clicked) continue;

      console.log(`Found "${clicked.text}" (${clicked.tag}), clicking...`);
      await sleep(3000);

      const newUrl = page.url();
      const newCount = await countProducts();

      console.log(` New URL: ${newUrl}`);
      console.log(` Products after click: ${newCount}`);

      if (newCount > navInfo.productCount) {
        console.log(` ✓ Found more products! (${navInfo.productCount} → ${newCount})`);
      }

      // Only the first matching label is tried.
      break;
    }

    // Check page height and scroll behavior
    console.log('\n[5] Checking scroll behavior on current page...');
    let previousHeight = 0;
    let scrollCount = 0;
    let previousProductCount = await countProducts();

    while (scrollCount < 10) {
      const currentHeight = await page.evaluate(() => document.body.scrollHeight);

      if (currentHeight === previousHeight) {
        console.log(` Scroll ${scrollCount + 1}: No height change, stopping`);
        break;
      }

      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await sleep(1500);

      const currentProductCount = await countProducts();

      console.log(
        ` Scroll ${scrollCount + 1}: height=${currentHeight}, products=${currentProductCount}`,
      );

      // Allow a few scrolls before concluding that nothing more will load.
      if (currentProductCount === previousProductCount && scrollCount > 2) {
        console.log(' No new products loading, stopping');
        break;
      }

      previousHeight = currentHeight;
      previousProductCount = currentProductCount;
      scrollCount++;
    }

    // Try direct URL patterns
    console.log('\n[6] Testing URL patterns...');
    const urlPatterns = [
      '/onlinemenu/flower?customerType=ADULT',
      '/onlinemenu/all?customerType=ADULT',
      '/onlinemenu?category=flower&customerType=ADULT',
      '/onlinemenu?view=all&customerType=ADULT',
    ];

    for (const pattern of urlPatterns) {
      const testUrl = `https://${STORE_ID}.treez.io${pattern}`;
      console.log(`\nTrying: ${testUrl}`);

      try {
        await page.goto(testUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        await sleep(2000);

        // Bypass age gate again if needed
        await dismissAgeGate();

        const productCount = await countProducts();
        console.log(` Products found: ${productCount}`);
      } catch (err: unknown) {
        // These URLs are guesses; a timeout on one must not end the probe.
        console.log(` Failed: ${describeError(err)}`);
      }
    }

    // Screenshot the final state
    await page.screenshot({ path: '/tmp/treez-explore.png', fullPage: true });
    console.log('\n[7] Screenshot saved to /tmp/treez-explore.png');
  } catch (error: unknown) {
    console.error('Error:', describeError(error));
  } finally {
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
main().catch(console.error);
|
||||||
188
backend/scripts/test-iheartjane.ts
Normal file
188
backend/scripts/test-iheartjane.ts
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
/**
|
||||||
|
* One-off script to test iHeartJane scraping
|
||||||
|
* Mimics remote worker: Puppeteer + stealth + proxy
|
||||||
|
*
|
||||||
|
* Usage: npx ts-node scripts/test-iheartjane.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
|
||||||
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
|
// Menu page that main() navigates to while capturing JSON responses.
const TARGET_URL = 'https://theflowershopusa.com/mesa/menu/';
|
||||||
|
// Store id used by main() for the direct api.iheartjane.com/v1/stores/<id> lookup.
const STORE_ID = 2788;
|
||||||
|
|
||||||
|
/**
 * One-off probe of an iHeartJane-backed menu page.
 *
 * Loads TARGET_URL in headless Chrome (no proxy), records every JSON response
 * whose URL mentions iheartjane.com or algolia, fetches the store record for
 * STORE_ID from inside the page, attempts to pull product data out of inline
 * scripts and the DOM, prints a summary, and writes:
 *   - /tmp/iheartjane-data.json  (all captured API responses)
 *   - /tmp/iheartjane-test.png   (screenshot on success)
 *   - /tmp/iheartjane-error.png  (screenshot on failure, best effort)
 *
 * Failures while probing are logged rather than rethrown, and the browser is
 * always closed.
 */
async function main() {
  console.log('[iHeartJane Test] Starting...');

  // No proxy for local testing
  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Intercept network requests to capture API calls
    const apiResponses: any[] = [];

    await page.setRequestInterception(true);
    page.on('request', (req) => {
      // Block heavy resources
      if (['image', 'font', 'media', 'stylesheet'].includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    page.on('response', async (response) => {
      const url = response.url();
      const contentType = response.headers()['content-type'] || '';

      // Capture any JSON response from iheartjane domains
      const fromJane = url.includes('iheartjane.com') || url.includes('algolia');
      if (!fromJane || !contentType.includes('json')) return;

      try {
        const json = await response.json();
        // Label by the first keyword found in the URL, in this priority order.
        const type = url.includes('store')
          ? 'STORE'
          : url.includes('product')
            ? 'PRODUCT'
            : url.includes('algolia')
              ? 'ALGOLIA'
              : 'API';
        apiResponses.push({ type, url, data: json });
        console.log(`[${type}] ${url.substring(0, 120)}...`);
      } catch {
        // Not JSON
      }
    });

    console.log(`[iHeartJane Test] Navigating to ${TARGET_URL}`);

    try {
      await page.goto(TARGET_URL, {
        waitUntil: 'networkidle2',
        timeout: 60000,
      });

      console.log('[iHeartJane Test] Menu page loaded, waiting for data...');

      // Wait a bit for all API calls to complete
      await new Promise((r) => setTimeout(r, 3000));

      // Also try to get store info by visiting the store page
      console.log('[iHeartJane Test] Fetching store info...');
      const storeInfoUrl = `https://api.iheartjane.com/v1/stores/${STORE_ID}`;

      // Try to fetch store info via page.evaluate (uses browser context)
      const storeInfo = await page.evaluate(async (storeId) => {
        try {
          const resp = await fetch(`https://api.iheartjane.com/v1/stores/${storeId}`);
          if (resp.ok) return await resp.json();
          return { error: resp.status };
        } catch (e) {
          return { error: e instanceof Error ? e.message : String(e) };
        }
      }, STORE_ID);

      if (storeInfo && !storeInfo.error) {
        apiResponses.push({ type: 'STORE_DIRECT', url: storeInfoUrl, data: storeInfo });
        console.log('[STORE_DIRECT] Got store info via fetch');
      } else {
        console.log(`[STORE_DIRECT] Failed: ${JSON.stringify(storeInfo)}`);
      }

      console.log('[iHeartJane Test] Processing results...');

      // Wait for products to load; a missing selector is reported, not fatal.
      await page
        .waitForSelector('[data-testid="product-card"], .product-card, [class*="ProductCard"]', {
          timeout: 30000,
        })
        .catch(() => console.log('[iHeartJane Test] No product cards found via selector'));

      // Try to extract product data from the page
      const products = await page.evaluate(() => {
        // Look for product data in various places
        const results: any[] = [];

        // Method 1: Look for __INITIAL_STATE__ or similar
        for (const script of Array.from(document.querySelectorAll('script'))) {
          const text = script.textContent || '';
          if (!(text.includes('products') && text.includes('price'))) continue;
          try {
            // Try to find JSON object
            const match = text.match(/\{[\s\S]*"products"[\s\S]*\}/);
            if (match) {
              results.push({ source: 'script', data: match[0].substring(0, 500) });
            }
          } catch {
            // Ignore scripts the pattern cannot be applied to.
          }
        }

        // Method 2: Look for product elements in DOM (first five only)
        const productElements = document.querySelectorAll(
          '[data-testid="product-card"], .product-card, [class*="product"]',
        );
        for (const el of Array.from(productElements).slice(0, 5)) {
          const name = el.querySelector('[class*="name"], h3, h4')?.textContent;
          const price = el.querySelector('[class*="price"]')?.textContent;
          if (name) {
            results.push({ source: 'dom', name, price });
          }
        }

        return results;
      });

      console.log('\n[iHeartJane Test] === RESULTS ===');
      console.log(`Total API responses captured: ${apiResponses.length}`);

      // Report what the in-page extraction found; previously it was discarded.
      console.log(`Page extraction results: ${products.length}`);
      for (const found of products) {
        console.log(` ${JSON.stringify(found)}`);
      }

      // Group by type
      const byType: Record<string, any[]> = {};
      for (const r of apiResponses) {
        byType[r.type] = byType[r.type] || [];
        byType[r.type].push(r);
      }

      for (const [type, items] of Object.entries(byType)) {
        console.log(`\n--- ${type} (${items.length} responses) ---`);
        for (const item of items) {
          console.log(`URL: ${item.url}`);

          const data = item.data;
          // A JSON body may be null or a primitive; property access would throw.
          if (data === null || typeof data !== 'object') {
            console.log(` Value: ${JSON.stringify(data)}`);
            continue;
          }

          // Show structure
          if (data.hits) {
            console.log(` Products: ${data.hits.length} hits`);
            if (data.hits[0]) {
              console.log(` Fields: ${Object.keys(data.hits[0]).join(', ')}`);
            }
          } else if (data.store) {
            console.log(` Store: ${JSON.stringify(data.store, null, 2).substring(0, 1000)}`);
          } else {
            console.log(` Keys: ${Object.keys(data).join(', ')}`);
          }
        }
      }

      // Write full data to file
      const fs = await import('fs');
      fs.writeFileSync('/tmp/iheartjane-data.json', JSON.stringify(apiResponses, null, 2));
      console.log('\n[iHeartJane Test] Full data saved to /tmp/iheartjane-data.json');

      // Take screenshot
      await page.screenshot({ path: '/tmp/iheartjane-test.png', fullPage: false });
      console.log('[iHeartJane Test] Screenshot saved to /tmp/iheartjane-test.png');
    } catch (error: unknown) {
      console.error(
        '[iHeartJane Test] Error:',
        error instanceof Error ? error.message : String(error),
      );
      // The page may be unusable at this point; never let the diagnostic
      // screenshot replace the error that was just reported.
      await page.screenshot({ path: '/tmp/iheartjane-error.png' }).catch(() => {
        console.error('[iHeartJane Test] Could not capture error screenshot');
      });
    }
  } finally {
    await browser.close();
  }

  console.log('[iHeartJane Test] Done');
}
|
||||||
|
|
||||||
|
main().catch(console.error);
|
||||||
224
backend/scripts/test-jane-api-explore.ts
Normal file
224
backend/scripts/test-jane-api-explore.ts
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
/**
|
||||||
|
* Explore Jane API to understand data structure
|
||||||
|
* Usage: npx ts-node scripts/test-jane-api-explore.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
|
||||||
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
console.log('Exploring Jane API from browser context...\n');
|
||||||
|
|
||||||
|
const browser = await puppeteer.launch({
|
||||||
|
headless: 'new',
|
||||||
|
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||||
|
});
|
||||||
|
|
||||||
|
const page = await browser.newPage();
|
||||||
|
|
||||||
|
// Intercept network requests to find store data API calls
|
||||||
|
const capturedResponses: Array<{ url: string; data: any }> = [];
|
||||||
|
|
||||||
|
await page.setRequestInterception(true);
|
||||||
|
page.on('request', (req) => req.continue());
|
||||||
|
|
||||||
|
page.on('response', async (response) => {
|
||||||
|
const url = response.url();
|
||||||
|
if (url.includes('iheartjane.com') &&
|
||||||
|
(url.includes('/stores') || url.includes('/search') || url.includes('algolia'))) {
|
||||||
|
try {
|
||||||
|
const text = await response.text();
|
||||||
|
if (text.startsWith('{') || text.startsWith('[')) {
|
||||||
|
const data = JSON.parse(text);
|
||||||
|
capturedResponses.push({ url, data });
|
||||||
|
console.log(`Captured: ${url.substring(0, 100)}...`);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Not JSON
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Visit Jane to establish session
|
||||||
|
console.log('Visiting Jane stores page to capture network requests...');
|
||||||
|
await page.goto('https://www.iheartjane.com/stores', {
|
||||||
|
waitUntil: 'networkidle2',
|
||||||
|
timeout: 60000,
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(`\nCaptured ${capturedResponses.length} API responses`);
|
||||||
|
|
||||||
|
for (const resp of capturedResponses) {
|
||||||
|
console.log(`\n--- ${resp.url.substring(0, 80)} ---`);
|
||||||
|
const keys = Object.keys(resp.data);
|
||||||
|
console.log('Keys:', keys);
|
||||||
|
|
||||||
|
// Check for stores array
|
||||||
|
if (resp.data.stores && Array.isArray(resp.data.stores)) {
|
||||||
|
console.log(`Stores count: ${resp.data.stores.length}`);
|
||||||
|
const firstStore = resp.data.stores[0];
|
||||||
|
if (firstStore) {
|
||||||
|
console.log('First store keys:', Object.keys(firstStore));
|
||||||
|
console.log('Sample:', JSON.stringify(firstStore, null, 2).substring(0, 500));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for hits (Algolia)
|
||||||
|
if (resp.data.hits && Array.isArray(resp.data.hits)) {
|
||||||
|
console.log(`Hits count: ${resp.data.hits.length}`);
|
||||||
|
const firstHit = resp.data.hits[0];
|
||||||
|
if (firstHit) {
|
||||||
|
console.log('First hit keys:', Object.keys(firstHit));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for __NEXT_DATA__ or similar embedded data
|
||||||
|
console.log('\n--- Checking for embedded page data ---');
|
||||||
|
const pageData = await page.evaluate(() => {
|
||||||
|
// Check for Next.js data
|
||||||
|
const nextData = (window as any).__NEXT_DATA__;
|
||||||
|
if (nextData?.props?.pageProps?.stores) {
|
||||||
|
return {
|
||||||
|
source: '__NEXT_DATA__',
|
||||||
|
storeCount: nextData.props.pageProps.stores.length,
|
||||||
|
firstStore: nextData.props.pageProps.stores[0],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for any global store data
|
||||||
|
const win = window as any;
|
||||||
|
if (win.stores) return { source: 'window.stores', data: win.stores };
|
||||||
|
if (win.__stores) return { source: 'window.__stores', data: win.__stores };
|
||||||
|
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (pageData) {
|
||||||
|
console.log('Found embedded data:', pageData.source);
|
||||||
|
console.log('Store count:', pageData.storeCount);
|
||||||
|
if (pageData.firstStore) {
|
||||||
|
console.log('First store keys:', Object.keys(pageData.firstStore));
|
||||||
|
console.log('Sample:', JSON.stringify({
|
||||||
|
id: pageData.firstStore.id,
|
||||||
|
name: pageData.firstStore.name,
|
||||||
|
city: pageData.firstStore.city,
|
||||||
|
state: pageData.firstStore.state,
|
||||||
|
}, null, 2));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
console.log('No embedded page data found');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try alternative API endpoints from browser context
|
||||||
|
console.log('\n--- Testing alternative API endpoints ---');
|
||||||
|
|
||||||
|
// Try the map endpoint
|
||||||
|
const mapData = await page.evaluate(async () => {
|
||||||
|
try {
|
||||||
|
const res = await fetch('https://api.iheartjane.com/v1/stores/map?per_page=100');
|
||||||
|
if (res.ok) return await res.json();
|
||||||
|
} catch {}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (mapData) {
|
||||||
|
console.log('\n/v1/stores/map response:');
|
||||||
|
console.log('Keys:', Object.keys(mapData));
|
||||||
|
if (mapData.stores?.[0]) {
|
||||||
|
console.log('First store keys:', Object.keys(mapData.stores[0]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try index endpoint
|
||||||
|
const indexData = await page.evaluate(async () => {
|
||||||
|
try {
|
||||||
|
const res = await fetch('https://api.iheartjane.com/v1/stores/index?per_page=10');
|
||||||
|
if (res.ok) return await res.json();
|
||||||
|
} catch {}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (indexData) {
|
||||||
|
console.log('\n/v1/stores/index response:');
|
||||||
|
console.log('Keys:', Object.keys(indexData));
|
||||||
|
if (indexData.stores?.[0]) {
|
||||||
|
console.log('First store keys:', Object.keys(indexData.stores[0]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try with state parameter
|
||||||
|
const stateData = await page.evaluate(async () => {
|
||||||
|
try {
|
||||||
|
const res = await fetch('https://api.iheartjane.com/v1/stores?state=AZ&per_page=10');
|
||||||
|
if (res.ok) return await res.json();
|
||||||
|
} catch {}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (stateData) {
|
||||||
|
console.log('\n/v1/stores?state=AZ response:');
|
||||||
|
console.log('Keys:', Object.keys(stateData));
|
||||||
|
console.log('Stores count:', stateData.stores?.length);
|
||||||
|
if (stateData.stores?.[0]) {
|
||||||
|
console.log('First store keys:', Object.keys(stateData.stores[0]));
|
||||||
|
console.log('Sample:', JSON.stringify(stateData.stores[0], null, 2).substring(0, 300));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try Algolia directly for stores
|
||||||
|
console.log('\n--- Testing Algolia for stores ---');
|
||||||
|
const algoliaStores = await page.evaluate(async () => {
|
||||||
|
try {
|
||||||
|
// Common Algolia search pattern
|
||||||
|
const res = await fetch('https://search.iheartjane.com/1/indexes/stores-production/query', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'X-Algolia-Application-Id': 'HKXSXRD7RA',
|
||||||
|
'X-Algolia-API-Key': 'YjZhYjQxZjU4ZTNjMTRhYzExZTk2YjU2MzliMGE4ZTE5YjJkMmZkZTI2ODllYTY2MThlMzQ3Y2QxOTFkMjI5Y3RhZ0ZpbHRlcnM9',
|
||||||
|
},
|
||||||
|
body: JSON.stringify({
|
||||||
|
query: 'Arizona',
|
||||||
|
hitsPerPage: 20,
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
if (res.ok) return await res.json();
|
||||||
|
} catch {}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (algoliaStores) {
|
||||||
|
console.log('Algolia stores-production response:');
|
||||||
|
console.log('Keys:', Object.keys(algoliaStores));
|
||||||
|
console.log('Hits count:', algoliaStores.hits?.length);
|
||||||
|
if (algoliaStores.hits?.[0]) {
|
||||||
|
console.log('First hit keys:', Object.keys(algoliaStores.hits[0]));
|
||||||
|
console.log('Sample:', JSON.stringify(algoliaStores.hits[0], null, 2).substring(0, 500));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if there's a /v2 endpoint
|
||||||
|
const v2Data = await page.evaluate(async () => {
|
||||||
|
try {
|
||||||
|
const res = await fetch('https://api.iheartjane.com/v2/stores?per_page=10');
|
||||||
|
if (res.ok) return await res.json();
|
||||||
|
} catch {}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (v2Data) {
|
||||||
|
console.log('\n/v2/stores response:');
|
||||||
|
console.log('Keys:', Object.keys(v2Data));
|
||||||
|
if (v2Data.stores?.[0]) {
|
||||||
|
console.log('First store keys:', Object.keys(v2Data.stores[0]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await browser.close();
|
||||||
|
console.log('\nDone!');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run the script. A rejected main() is logged AND reflected in the exit status,
// so shells/CI do not see a failed run as success.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
126
backend/scripts/test-jane-client.ts
Normal file
126
backend/scripts/test-jane-client.ts
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
/**
|
||||||
|
* Test script for Jane platform client
|
||||||
|
* Tests the new Jane integration with The Flower Shop Mesa
|
||||||
|
*
|
||||||
|
* Usage: npx ts-node scripts/test-jane-client.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import {
|
||||||
|
startSession,
|
||||||
|
endSession,
|
||||||
|
fetchProductsFromUrl,
|
||||||
|
resolveStoreFromUrl,
|
||||||
|
} from '../src/platforms/jane';
|
||||||
|
import { JaneNormalizer } from '../src/hydration/normalizers/jane';
|
||||||
|
|
||||||
|
const TEST_URL = 'https://theflowershopusa.com/mesa/menu/';
|
||||||
|
|
||||||
|
/**
 * Smoke-tests the Jane platform client against TEST_URL.
 *
 * Test 1 fetches the menu through fetchProductsFromUrl and prints the captured
 * store plus a sample of products. Test 2 (only when products were captured)
 * wraps the raw hits in a hand-built payload row and runs JaneNormalizer on it.
 * Any thrown error is reported and the process exits with status 1.
 */
async function main() {
  const rule = '='.repeat(60);

  // Render possibly-missing values as "N/A" instead of "null%" / "undefined...".
  const percent = (value: unknown): string => (value == null ? 'N/A' : `${value}%`);

  console.log(rule);
  console.log('Jane Platform Client Test');
  console.log(rule);
  console.log(`Test URL: ${TEST_URL}`);
  console.log('');

  try {
    // Test 1: Fetch products from URL
    console.log('[Test 1] Fetching products from menu URL...');
    const result = await fetchProductsFromUrl(TEST_URL);

    console.log('');
    console.log('[Results]');
    console.log(` Store: ${result.store?.name || 'Not captured'}`);
    console.log(` Store ID: ${result.store?.id || 'N/A'}`);
    console.log(` Products captured: ${result.products.length}`);
    console.log(` API responses: ${result.responses.length}`);

    if (result.store) {
      console.log('');
      console.log('[Store Info]');
      console.log(` Address: ${result.store.address}, ${result.store.city}, ${result.store.state} ${result.store.zip}`);
      console.log(` Phone: ${result.store.phone}`);
      console.log(` Coordinates: ${result.store.lat}, ${result.store.long}`);
      console.log(` Medical: ${result.store.medical}, Recreational: ${result.store.recreational}`);
      console.log(` Rating: ${result.store.rating} (${result.store.reviews_count} reviews)`);
      console.log(` Product count (store): ${result.store.product_count}`);
    }

    if (result.products.length > 0) {
      console.log('');
      console.log('[Sample Products (first 5)]');
      for (const p of result.products.slice(0, 5)) {
        // Prefer the per-gram price, fall back to the each price; only prefix "$" when a price exists.
        const price = p.price_gram || p.price_each;
        const priceLabel = price ? `$${price}` : 'N/A';
        console.log(` - ${p.name} (${p.brand}) - ${priceLabel}`);
        console.log(` Kind: ${p.kind}, Category: ${p.category}, THC: ${percent(p.percent_thc)}`);
      }

      // Test 2: Normalize products
      console.log('');
      console.log('[Test 2] Testing normalizer...');
      const normalizer = new JaneNormalizer();

      // Build a fake payload structure: only raw_json and product_count carry
      // real data, the remaining fields are placeholders.
      const fakePayload = {
        id: 'test-payload',
        dispensary_id: 9999,
        crawl_run_id: null,
        platform: 'jane',
        payload_version: 1,
        raw_json: { hits: result.products.map(p => p.raw) },
        product_count: result.products.length,
        pricing_type: null,
        crawl_mode: null,
        fetched_at: new Date(),
        processed: false,
        normalized_at: null,
        hydration_error: null,
        hydration_attempts: 0,
        created_at: new Date(),
      };

      const normalized = normalizer.normalize(fakePayload);

      console.log(` Products normalized: ${normalized.products.length}`);
      console.log(` Brands extracted: ${normalized.brands.length}`);
      console.log(` Categories extracted: ${normalized.categories.length}`);
      console.log(` Errors: ${normalized.errors.length}`);

      if (normalized.products.length > 0) {
        console.log('');
        console.log('[Sample Normalized Product]');
        const np = normalized.products[0];
        console.log(` External ID: ${np.externalProductId}`);
        console.log(` Name: ${np.name}`);
        console.log(` Brand: ${np.brandName}`);
        console.log(` Category: ${np.category}`);
        console.log(` Type: ${np.type}`);
        console.log(` Strain: ${np.strainType}`);
        console.log(` THC: ${percent(np.thcPercent)}`);
        console.log(` CBD: ${percent(np.cbdPercent)}`);
        console.log(` Image: ${np.primaryImageUrl ? `${np.primaryImageUrl.slice(0, 60)}...` : 'N/A'}`);

        const pricing = normalized.pricing.get(np.externalProductId);
        if (pricing) {
          console.log(` Price (cents): ${pricing.priceRec}`);
          console.log(` On Special: ${pricing.isOnSpecial}`);
        }
      }
    }

    console.log('');
    console.log(rule);
    console.log('TEST PASSED');
    console.log(rule);
  } catch (error: unknown) {
    console.error('');
    console.error(rule);
    console.error('TEST FAILED');
    console.error(rule);
    // Anything can be thrown; only Error instances carry message/stack.
    if (error instanceof Error) {
      console.error(`Error: ${error.message}`);
      console.error(error.stack);
    } else {
      console.error(`Error: ${String(error)}`);
    }
    process.exit(1);
  }
}
|
||||||
|
|
||||||
|
// Run the script. A rejected main() is logged AND reflected in the exit status,
// so shells/CI do not see a failed run as success.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
55
backend/scripts/test-jane-med-rec-compare.ts
Normal file
55
backend/scripts/test-jane-med-rec-compare.ts
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
/**
|
||||||
|
* Compare MED vs REC product menus for same location
|
||||||
|
*/
|
||||||
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
|
/**
 * Compares the first 100 menu products of the REC store (3379) and the MED
 * store (4540) and prints how many product IDs are REC-only, MED-only and shared.
 *
 * The search requests are issued from inside a page opened on iheartjane.com.
 * The browser is always closed, even when a step throws.
 */
async function main() {
  const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox'] });

  try {
    const page = await browser.newPage();

    await page.goto('https://www.iheartjane.com/stores', { waitUntil: 'domcontentloaded' });
    // NOTE(review): fixed 2s pause, presumably to let the page settle before in-page fetches - confirm.
    await new Promise(r => setTimeout(r, 2000));

    // Fetches up to 100 product IDs for one store from within the page context.
    const fetchProductIds = (storeId: number): Promise<number[]> =>
      page.evaluate(async (sid: number) => {
        const res = await fetch('https://search.iheartjane.com/1/indexes/menu-products-production/query', {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ query: '', hitsPerPage: 100, filters: `store_id=${sid}` }),
        });
        const data = await res.json();
        return data.hits?.map((h: any) => h.product_id) || [];
      }, storeId);

    // Fetch REC products (store 3379)
    const recProducts = await fetchProductIds(3379);

    // Fetch MED products (store 4540)
    const medProducts = await fetchProductIds(4540);

    const recSet = new Set(recProducts);
    const medSet = new Set(medProducts);

    const recOnly = recProducts.filter(id => !medSet.has(id)).length;
    const medOnly = medProducts.filter(id => !recSet.has(id)).length;
    const shared = recProducts.filter(id => medSet.has(id)).length;

    // The menus are identical only when neither side has an exclusive product;
    // an empty fetch on either side says nothing about the menus.
    let verdict: string;
    if (recProducts.length === 0 || medProducts.length === 0) {
      verdict = 'INCONCLUSIVE (no products fetched for at least one store)';
    } else if (shared === 0) {
      verdict = 'COMPLETELY DIFFERENT';
    } else if (recOnly === 0 && medOnly === 0) {
      verdict = 'IDENTICAL';
    } else {
      verdict = 'PARTIALLY OVERLAPPING';
    }

    console.log('\nHana Phoenix - MED vs REC comparison (100 products each):');
    console.log(' REC products fetched:', recProducts.length);
    console.log(' MED products fetched:', medProducts.length);
    console.log(' REC-only:', recOnly);
    console.log(' MED-only:', medOnly);
    console.log(' Shared:', shared);
    console.log(' Menus are:', verdict);
  } finally {
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
// Run the script. A rejected main() is logged AND reflected in the exit status,
// so shells/CI do not see a failed run as success.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
79
backend/scripts/test-jane-med-rec-diff.ts
Normal file
79
backend/scripts/test-jane-med-rec-diff.ts
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
/**
|
||||||
|
* Find ALL differing fields between MED and REC product payloads
|
||||||
|
*/
|
||||||
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
|
/**
 * Takes the first product returned for the REC store (3379), looks up the same
 * product_id among the first 100 hits of the MED store (4540), and prints every
 * payload field whose JSON value differs, followed by the limit-related fields.
 *
 * The browser is always closed, even when a step throws or there is nothing to compare.
 */
async function main() {
  const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox'] });

  try {
    const page = await browser.newPage();

    await page.goto('https://www.iheartjane.com/stores', { waitUntil: 'domcontentloaded' });
    // NOTE(review): fixed 2s pause, presumably to let the page settle before in-page fetches - confirm.
    await new Promise(r => setTimeout(r, 2000));

    // Get full product payload from REC store
    const recProduct = await page.evaluate(async () => {
      const res = await fetch('https://search.iheartjane.com/1/indexes/menu-products-production/query', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ query: '', hitsPerPage: 1, filters: 'store_id=3379' }),
      });
      const data = await res.json();
      return data.hits?.[0];
    });

    if (!recProduct) {
      console.log('No product returned for REC store 3379 - nothing to compare');
      return;
    }

    const productId = recProduct.product_id;

    // Get same product from MED store (only the first 100 hits are searched)
    const medProduct = await page.evaluate(async (pid: number) => {
      const res = await fetch('https://search.iheartjane.com/1/indexes/menu-products-production/query', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ query: '', hitsPerPage: 100, filters: 'store_id=4540' }),
      });
      const data = await res.json();
      return data.hits?.find((h: any) => h.product_id === pid);
    }, productId);

    console.log('Product:', recProduct.name, '(ID:', productId, ')\n');

    if (!medProduct) {
      // Diffing against a missing payload would report every field as different.
      console.log('Product not found in the first 100 hits of MED store 4540 - cannot diff');
      return;
    }

    // Get all keys
    const allKeys = new Set([...Object.keys(recProduct), ...Object.keys(medProduct)]);
    const sortedKeys = [...allKeys].sort();

    console.log('=== ALL KEYS IN PAYLOAD ===');
    console.log(sortedKeys.join(', '));

    console.log('\n=== FIELDS THAT DIFFER ===');
    let diffCount = 0;
    for (const key of sortedKeys) {
      // JSON.stringify yields undefined for a missing key, hence the optional chaining below.
      const recVal = JSON.stringify(recProduct[key]);
      const medVal = JSON.stringify(medProduct[key]);
      if (recVal !== medVal) {
        diffCount++;
        console.log(`${key}:`);
        console.log(` REC: ${recVal?.substring(0, 100)}`);
        console.log(` MED: ${medVal?.substring(0, 100)}`);
      }
    }

    if (diffCount === 0) {
      console.log('(none - payloads are identical)');
    }

    // Check for limit/allowance related fields (plain substring match on the key name)
    console.log('\n=== LIMIT-RELATED FIELDS ===');
    const limitFields = sortedKeys.filter(k =>
      k.includes('limit') || k.includes('max') || k.includes('allow') ||
      k.includes('quantity') || k.includes('cart') || k.includes('medical') ||
      k.includes('rec') || k.includes('weight')
    );
    for (const key of limitFields) {
      console.log(`${key}: REC=${JSON.stringify(recProduct[key])} | MED=${JSON.stringify(medProduct[key])}`);
    }
  } finally {
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
// Run the script. A rejected main() is logged AND reflected in the exit status,
// so shells/CI do not see a failed run as success.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
35
backend/scripts/test-jane-payload.ts
Normal file
35
backend/scripts/test-jane-payload.ts
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
/**
|
||||||
|
* Test script to capture and save full Jane payload
|
||||||
|
* Usage: npx ts-node scripts/test-jane-payload.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import { fetchProductsFromUrl } from '../src/platforms/jane';
|
||||||
|
|
||||||
|
const TEST_URL = 'https://theflowershopusa.com/mesa/menu/';
|
||||||
|
const OUTPUT_FILE = '/tmp/jane-test-payload.json';
|
||||||
|
|
||||||
|
/**
 * Fetches the Jane menu at TEST_URL and writes the captured data to OUTPUT_FILE
 * as pretty-printed JSON, then prints the product count and file size.
 */
async function main() {
  console.log('Fetching Jane payload...');

  const result = await fetchProductsFromUrl(TEST_URL);
  const { products, store, responses } = result;

  // Same shape that would be persisted: raw hits plus capture metadata.
  const rawHits = products.map((product) => product.raw);
  const payload = {
    hits: rawHits,
    store: store?.raw || null,
    capturedAt: new Date().toISOString(),
    platform: 'jane',
    storeId: store?.id,
    productCount: products.length,
    responseCount: responses.length,
  };

  const serialized = JSON.stringify(payload, null, 2);
  fs.writeFileSync(OUTPUT_FILE, serialized);

  // Size is read back from disk and reported in whole kilobytes.
  const sizeKb = Math.round(fs.statSync(OUTPUT_FILE).size / 1024);
  console.log(`\nPayload saved to: ${OUTPUT_FILE}`);
  console.log(`Products: ${products.length}`);
  console.log(`Size: ${sizeKb}KB`);
}
|
||||||
|
|
||||||
|
// Run the script. A rejected main() is logged AND reflected in the exit status,
// so shells/CI do not see a failed run as success.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
113
backend/scripts/test-treez-client.ts
Normal file
113
backend/scripts/test-treez-client.ts
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
/**
|
||||||
|
* Test script for Treez platform client
|
||||||
|
* Tests the new Treez integration with Best Dispensary
|
||||||
|
*
|
||||||
|
* Usage: npx ts-node scripts/test-treez-client.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import {
|
||||||
|
fetchProductsByStoreId,
|
||||||
|
} from '../src/platforms/treez';
|
||||||
|
import { TreezNormalizer } from '../src/hydration/normalizers/treez';
|
||||||
|
|
||||||
|
const TEST_STORE_ID = 'best';
|
||||||
|
|
||||||
|
/**
 * Smoke-tests the Treez platform client against TEST_STORE_ID.
 *
 * Test 1 fetches the store's products and prints a sample. Test 2 (only when
 * products were captured) feeds them through TreezNormalizer via a hand-built
 * payload row. On any thrown error the failure is printed and the process
 * exits with status 1.
 */
async function main() {
  const banner = '='.repeat(60);

  console.log(banner);
  console.log('Treez Platform Client Test');
  console.log(banner);
  console.log(`Test Store: ${TEST_STORE_ID}`);
  console.log('');

  try {
    // Test 1: Fetch products from store
    console.log('[Test 1] Fetching products from Treez store...');
    const result = await fetchProductsByStoreId(TEST_STORE_ID);

    console.log('');
    console.log('[Results]');
    console.log(` Store: ${result.store.name}`);
    console.log(` Store ID: ${result.store.storeId}`);
    console.log(` Products captured: ${result.products.length}`);
    console.log(` Scroll count: ${result.scrollCount}`);

    const hasProducts = result.products.length > 0;
    if (hasProducts) {
      console.log('');
      console.log('[Sample Products (first 5)]');
      result.products.slice(0, 5).forEach((item) => {
        const priceLabel = item.price ? `$${item.price}` : 'N/A';
        const thcLabel = item.thcPercent !== null ? `${item.thcPercent}%` : 'N/A';
        console.log(` - ${item.name}`);
        console.log(` Brand: ${item.brand || 'N/A'}`);
        console.log(` Category: ${item.category || 'N/A'} / ${item.subcategory || 'N/A'}`);
        console.log(` Price: ${priceLabel}`);
        console.log(` THC: ${thcLabel}`);
      });

      // Test 2: Normalize products
      console.log('');
      console.log('[Test 2] Testing normalizer...');
      const normalizer = new TreezNormalizer();

      // Stand-in payload row: only raw_json and product_count carry real data.
      const fakePayload = {
        id: 'test-payload',
        dispensary_id: 9999,
        crawl_run_id: null,
        platform: 'treez',
        payload_version: 1,
        raw_json: { products: result.products },
        product_count: result.products.length,
        pricing_type: null,
        crawl_mode: null,
        fetched_at: new Date(),
        processed: false,
        normalized_at: null,
        hydration_error: null,
        hydration_attempts: 0,
        created_at: new Date(),
      };

      const normalized = normalizer.normalize(fakePayload);

      console.log(` Products normalized: ${normalized.products.length}`);
      console.log(` Brands extracted: ${normalized.brands.length}`);
      console.log(` Categories extracted: ${normalized.categories.length}`);
      console.log(` Errors: ${normalized.errors.length}`);

      if (normalized.products.length > 0) {
        const first = normalized.products[0];
        const thc = first.thcPercent !== null ? `${first.thcPercent}%` : 'N/A';
        const cbd = first.cbdPercent !== null ? `${first.cbdPercent}%` : 'N/A';

        console.log('');
        console.log('[Sample Normalized Product]');
        console.log(` External ID: ${first.externalProductId}`);
        console.log(` Name: ${first.name}`);
        console.log(` Brand: ${first.brandName}`);
        console.log(` Category: ${first.category}`);
        console.log(` Type: ${first.type}`);
        console.log(` Strain: ${first.strainType}`);
        console.log(` THC: ${thc}`);
        console.log(` CBD: ${cbd}`);
        console.log(` Image: ${first.primaryImageUrl?.slice(0, 60) || 'N/A'}...`);

        const priceInfo = normalized.pricing.get(first.externalProductId);
        if (priceInfo) {
          console.log(` Price (cents): ${priceInfo.priceRec}`);
        }
      }
    }

    console.log('');
    console.log(banner);
    console.log('TEST PASSED');
    console.log(banner);
  } catch (error: any) {
    console.error('');
    console.error(banner);
    console.error('TEST FAILED');
    console.error(banner);
    console.error(`Error: ${error.message}`);
    console.error(error.stack);
    process.exit(1);
  }
}
|
||||||
|
|
||||||
|
// Run the script. A rejected main() is logged AND reflected in the exit status,
// so shells/CI do not see a failed run as success.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
559
backend/scripts/test-treez-discovery.ts
Normal file
559
backend/scripts/test-treez-discovery.ts
Normal file
@@ -0,0 +1,559 @@
|
|||||||
|
/**
|
||||||
|
* Treez Platform Smoke Test
|
||||||
|
*
|
||||||
|
* Discovers DOM structure and extracts products from Treez menu pages.
|
||||||
|
* Used to determine actual CSS selectors for the platform client.
|
||||||
|
*
|
||||||
|
* Usage: npx ts-node scripts/test-treez-discovery.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer, { Page } from 'puppeteer';
|
||||||
|
import puppeteerExtra from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
|
||||||
|
// Register stealth plugin (even though Treez doesn't use Cloudflare, good practice)
|
||||||
|
puppeteerExtra.use(StealthPlugin());
|
||||||
|
|
||||||
|
const TEST_URL = 'https://best.treez.io/onlinemenu/?customerType=ADULT';
|
||||||
|
const STORE_ID = 'best';
|
||||||
|
|
||||||
|
/**
 * One product as scraped from the Treez menu DOM, before any normalization.
 * Fields typed `| null` are null when the value could not be read from the page.
 */
interface TreezProductRaw {
  // Identity
  productId: string;
  name: string;
  brand: string;

  // Classification
  category: string;
  subcategory: string;

  // Potency (percentages)
  thcPercent: number | null;
  cbdPercent: number | null;

  // Pricing
  price: number | null;
  priceUnit: string;

  // Presentation / availability
  imageUrl: string | null;
  inStock: boolean;
  weight: string | null;
}
|
||||||
|
|
||||||
|
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param ms - Delay handed to setTimeout.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Repeatedly scrolls the page to the bottom so lazily-loaded products get rendered.
 *
 * Stops after `maxScrolls` scrolls, or earlier once document.body.scrollHeight
 * has been unchanged for three consecutive checks.
 *
 * @param page - Puppeteer page showing the menu.
 * @param maxScrolls - Upper bound on scroll attempts (default 30).
 * @returns Number of scrolls actually performed.
 */
async function scrollToLoadAll(page: Page, maxScrolls = 30): Promise<number> {
  console.log('[Scroll] Starting infinite scroll...');

  let lastHeight = 0;
  let stalledChecks = 0;
  let scrollsDone = 0;

  for (; scrollsDone < maxScrolls; scrollsDone++) {
    const height = await page.evaluate(() => document.body.scrollHeight);

    // Count consecutive checks with an unchanged page height; any growth resets the streak.
    stalledChecks = height === lastHeight ? stalledChecks + 1 : 0;
    if (stalledChecks >= 3) {
      console.log('[Scroll] No new content after 3 attempts, stopping');
      break;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(1500); // Give newly requested products time to load
    lastHeight = height;

    // Rough progress indicator only: the first candidate selector matching more than 10 elements wins.
    const approxProducts = await page.evaluate(() => {
      const candidates = [
        '[class*="product"]',
        '[class*="Product"]',
        '[data-product]',
        '.menu-item',
        '[class*="card"]',
        '[class*="Card"]',
      ];
      const matchCounts = candidates.map((sel) => document.querySelectorAll(sel).length);
      return matchCounts.find((n) => n > 10) ?? 0;
    });

    console.log(`[Scroll] Scroll ${scrollsDone + 1}: height=${height}, products~${approxProducts}`);
  }

  return scrollsDone;
}
|
||||||
|
|
||||||
|
/**
 * Analyze DOM structure to find product selectors.
 *
 * Prints, for the currently loaded page:
 *  - up to 20 distinct class names containing "product" and "card" (case-insensitive),
 *  - up to 30 distinct data-* attribute names (excluding ones containing "reactid"),
 *  - a sample product container: the first match of a few well-known selectors, or
 *    otherwise the most frequently repeated div class (more than 5 occurrences).
 *
 * @param page - Puppeteer page to inspect.
 */
async function analyzeDOM(page: Page): Promise<void> {
  console.log('\n' + '='.repeat(60));
  console.log('DOM STRUCTURE ANALYSIS');
  console.log('='.repeat(60));

  // Collects up to 20 distinct class tokens whose lowercase form contains `keyword`.
  const findClassesContaining = (keyword: string): Promise<string[]> =>
    page.evaluate((needle: string) => {
      const classes = new Set<string>();
      document.querySelectorAll('*').forEach((el) => {
        const className = el.className;
        // className is not a string on every element (presumably SVG) - skip those.
        if (typeof className !== 'string') return;
        // Split on any whitespace so tabs/newlines/double spaces do not yield odd tokens.
        for (const token of className.split(/\s+/)) {
          if (token.toLowerCase().includes(needle)) {
            classes.add(token);
          }
        }
      });
      return Array.from(classes).slice(0, 20);
    }, keyword);

  // Find elements with "product" in class name
  const productClasses = await findClassesContaining('product');
  console.log('\n[Classes containing "product"]:');
  productClasses.forEach((c: string) => console.log(` .${c}`));

  // Find elements with "card" in class name
  const cardClasses = await findClassesContaining('card');
  console.log('\n[Classes containing "card"]:');
  cardClasses.forEach((c: string) => console.log(` .${c}`));

  // Find data attributes
  const dataAttrs = await page.evaluate(() => {
    const attrs = new Set<string>();
    document.querySelectorAll('*').forEach((el) => {
      Array.from(el.attributes).forEach((attr) => {
        if (attr.name.startsWith('data-') && !attr.name.includes('reactid')) {
          attrs.add(attr.name);
        }
      });
    });
    return Array.from(attrs).slice(0, 30);
  });

  console.log('\n[Data attributes found]:');
  dataAttrs.forEach((attr: string) => console.log(` ${attr}`));

  // Get sample HTML of potential product container
  const sampleHTML = await page.evaluate(() => {
    // Uniform result shape; `count` is only set by the repeating-structure fallback.
    const describe = (
      el: Element,
      selector: string,
      count?: number,
    ): { selector: string; html: string; childCount: number; count?: number } => ({
      selector,
      html: el.outerHTML.slice(0, 2000),
      childCount: el.children.length,
      count,
    });

    // Try to find a product container
    const selectors = [
      '[class*="ProductCard"]',
      '[class*="product-card"]',
      '[class*="menuItem"]',
      '[class*="menu-item"]',
      '[data-testid*="product"]',
    ];

    for (const sel of selectors) {
      const el = document.querySelector(sel);
      if (el) {
        return describe(el, sel);
      }
    }

    // Fallback: find repeating structures
    const classCounts = new Map<string, number>();
    document.querySelectorAll('div[class]').forEach((el) => {
      if (el.children.length > 2 && el.className) {
        classCounts.set(el.className, (classCounts.get(el.className) || 0) + 1);
      }
    });

    // Find class that appears many times (likely product cards)
    let bestClass = '';
    let bestCount = 0;
    classCounts.forEach((count, className) => {
      if (count > bestCount && count > 5) {
        bestCount = count;
        bestClass = className;
      }
    });

    // Use the first non-empty token and escape it: a raw class attribute may start
    // with whitespace or contain characters that would make the selector invalid
    // and cause querySelector to throw.
    const firstToken = bestClass.split(/\s+/).find((token) => token.length > 0);
    if (firstToken) {
      const selector = `.${CSS.escape(firstToken)}`;
      const el = document.querySelector(selector);
      if (el) {
        return describe(el, selector, bestCount);
      }
    }

    return null;
  });

  if (sampleHTML) {
    console.log('\n[Sample Product Container]:');
    console.log(` Selector: ${sampleHTML.selector}`);
    console.log(` Children: ${sampleHTML.childCount}`);
    if (sampleHTML.count) {
      console.log(` Occurrences: ${sampleHTML.count}`);
    }
    console.log('\n[Sample HTML (first 1000 chars)]:');
    console.log(sampleHTML.html.slice(0, 1000));
  }
}
|
||||||
|
|
||||||
|
/**
 * Scrape product data out of the rendered Treez/GapCommerce menu DOM.
 *
 * All parsing happens inside `page.evaluate`, i.e. in the browser context, so
 * the callback cannot reference anything from this module's scope and its
 * `console.log` calls go to the page console rather than to Node.
 *
 * @param page Page that has already been navigated to the menu.
 * @returns One record per distinct product name found on the page.
 */
async function extractProducts(page: Page): Promise<TreezProductRaw[]> {
  const banner = '='.repeat(60);
  console.log('\n' + banner);
  console.log('PRODUCT EXTRACTION');
  console.log(banner);

  return page.evaluate(() => {
    // Candidate card selectors, tried in order; the first one that yields cards wins.
    const cardSelectors = [
      '[class*="product_product__"]', // Main product container
      '[class*="ProductCard"]', // Alternative pattern
    ];

    let cards: Element[] = [];
    for (const selector of cardSelectors) {
      // Keep only elements that contain a name element or a price element,
      // which drops unrelated nodes that merely share the class fragment.
      const candidates = Array.from(document.querySelectorAll(selector)).filter(
        (node) =>
          node.querySelector('[class*="product__name"]') ||
          node.querySelector('[class*="name__"]') ||
          node.querySelector('[class*="price"]')
      );

      if (candidates.length === 0) continue;

      cards = candidates;
      console.log(`Found ${candidates.length} products with selector: ${selector}`);
      break;
    }

    // Text patterns used to guess the category; first match wins.
    const categoryRules: Array<[RegExp, string]> = [
      [/flower|bud/i, 'flower'],
      [/vape|cart|pen/i, 'vape'],
      [/edible|gummy|chocolate/i, 'edible'],
      [/concentrate|dab|wax|shatter/i, 'concentrate'],
      [/pre.?roll|joint/i, 'pre-roll'],
      [/topical|balm|cream/i, 'topical'],
      [/tincture/i, 'tincture'],
    ];
    const strainNames = ['indica', 'sativa', 'hybrid'];

    const collected: any[] = [];
    // Cards are de-duplicated by product name.
    const namesSeen = new Set<string>();

    for (const card of cards) {
      try {
        const name =
          card.querySelector('[class*="product__name"], [class*="name__"]')?.textContent?.trim() || '';
        if (!name || namesSeen.has(name)) continue;
        namesSeen.add(name);

        // Prefer the slug from the product link; otherwise derive an ID from the name.
        const href = card.querySelector('a[href*="/product/"]')?.getAttribute('href') || '';
        const slugMatch = href.match(/\/product\/([^\/\?]+)/);
        const productId =
          (slugMatch ? slugMatch[1] : '') ||
          `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`;

        const brand = card.querySelector('[class*="brand"], [class*="Brand"]')?.textContent?.trim() || '';

        // First "$<amount>" found in the price element's text.
        const priceText = card.querySelector('[class*="price__ins"], [class*="price"]')?.textContent || '';
        const priceMatch = priceText.match(/\$(\d+(?:\.\d{2})?)/);
        const price = priceMatch ? parseFloat(priceMatch[1]) : null;

        // Unwrap Next.js image-optimizer URLs to the underlying image URL.
        let imageUrl = card.querySelector('img')?.getAttribute('src') || null;
        if (imageUrl && imageUrl.includes('/_next/image')) {
          const wrapped = imageUrl.match(/url=([^&]+)/);
          if (wrapped) {
            imageUrl = decodeURIComponent(wrapped[1]);
          }
        }

        // Everything below is inferred from the card's full text content.
        const text = card.textContent || '';
        const textLower = text.toLowerCase();

        // Potency: accepts both "25.5% THC" and "THC 25.5%" style text.
        const thcMatch =
          text.match(/(?:THC[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*THC/i) ||
          text.match(/THC[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
        const cbdMatch =
          text.match(/(?:CBD[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*CBD/i) ||
          text.match(/CBD[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
        const thcPercent = thcMatch ? parseFloat(thcMatch[1]) : null;
        const cbdPercent = cbdMatch ? parseFloat(cbdMatch[1]) : null;

        // Weight: looked up in the name first, then in the whole card text.
        const weightMatch =
          name.match(/(\d+(?:\.\d+)?)\s*(G|g|MG|mg|OZ|oz)/i) ||
          text.match(/(\d+(?:\.\d+)?)\s*(G|g|MG|mg|OZ|oz)/i);
        const weight = weightMatch ? `${weightMatch[1]}${weightMatch[2].toLowerCase()}` : null;
        const priceUnit = weight || '';

        const subcategory = strainNames.find((strain) => textLower.includes(strain)) || '';
        const matchedRule = categoryRules.find(([pattern]) => pattern.test(text));
        const category = matchedRule ? matchedRule[1] : '';

        const inStock = !(textLower.includes('out of stock') || textLower.includes('sold out'));

        collected.push({
          productId,
          name,
          brand,
          category,
          subcategory,
          thcPercent,
          cbdPercent,
          price,
          priceUnit,
          imageUrl,
          inStock,
          weight,
        });
      } catch (err) {
        // A single malformed card must not abort the whole extraction.
        console.log('Error extracting product:', err);
      }
    }

    return collected;
  });
}
|
||||||
|
|
||||||
|
/**
 * Dismiss the age-verification modal if the page is showing one.
 *
 * @param page Page to inspect.
 * @returns `true` when a gate was found and its confirm button was clicked
 *   (even if the modal did not disappear within the wait); `false` when no
 *   gate or no button was found, or when any step threw.
 */
async function bypassAgeGate(page: Page): Promise<boolean> {
  console.log('[Age Gate] Checking for age gate...');

  try {
    const gate = await page.$('[data-testid="age-gate-modal"], [class*="AgeGate"]');
    if (!gate) {
      console.log('[Age Gate] No age gate detected');
      return false;
    }

    console.log('[Age Gate] Age gate detected, clicking confirm button...');

    const confirmButton = await page.$('[data-testid="age-gate-submit-button"], button[type="submit"]');
    if (!confirmButton) {
      console.log('[Age Gate] No submit button found');
      return false;
    }

    await confirmButton.click();
    console.log('[Age Gate] Clicked confirm button');

    // Fixed pause first, then poll until the modal is gone from the DOM.
    await sleep(2000);

    const gateIsGone = () => !document.querySelector('[data-testid="age-gate-modal"]');
    await page.waitForFunction(gateIsGone, { timeout: 10000 }).catch(() => {
      // Best effort: a timeout here is logged but does not fail the bypass.
      console.log('[Age Gate] Gate may still be visible, continuing anyway');
    });

    console.log('[Age Gate] Age gate bypassed');
    return true;
  } catch (err: any) {
    console.log(`[Age Gate] Error: ${err.message}`);
    return false;
  }
}
|
||||||
|
|
||||||
|
/**
 * Smoke-test entry point: opens the Treez menu, dismisses the age gate,
 * scrolls to load products, extracts them and prints a data-quality report.
 *
 * Artifacts are written under /tmp: a screenshot on every run, the product
 * JSON when extraction succeeds, and the page HTML when nothing was extracted.
 *
 * On failure the error is logged and `process.exitCode` is set to 1. The
 * process is deliberately NOT terminated with `process.exit()` inside the
 * `catch`, because that would end the process before the `finally` block
 * runs and leave the launched browser process behind.
 */
async function main(): Promise<void> {
  const divider = '='.repeat(60);

  console.log(divider);
  console.log('TREEZ PLATFORM SMOKE TEST');
  console.log(divider);
  console.log(`Store ID: ${STORE_ID}`);
  console.log(`URL: ${TEST_URL}`);
  console.log('');

  const browser = await puppeteerExtra.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    const page = await browser.newPage();

    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    );

    console.log('[Navigation] Going to Treez menu page...');
    await page.goto(TEST_URL, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    console.log('[Navigation] Page loaded, waiting for React app...');
    await sleep(2000);

    await bypassAgeGate(page);

    console.log('[Navigation] Waiting for menu content...');
    await sleep(3000);

    const pageTitle = await page.title();
    console.log(`[Navigation] Page title: ${pageTitle}`);

    // Screenshot of the current viewport for debugging.
    await page.screenshot({ path: '/tmp/treez-smoke-test.png', fullPage: false });
    console.log('[Debug] Screenshot saved to /tmp/treez-smoke-test.png');

    await analyzeDOM(page);
    await scrollToLoadAll(page);

    const products = await extractProducts(page);

    console.log('\n' + divider);
    console.log('RESULTS');
    console.log(divider);
    console.log(`Total products extracted: ${products.length}`);

    const fs = await import('fs');

    if (products.length > 0) {
      // Share of products for which each field was populated.
      const percentOfAll = (count: number): number => Math.round((count / products.length) * 100);

      const withPrice = products.filter((p) => p.price !== null).length;
      const withThc = products.filter((p) => p.thcPercent !== null).length;
      const withBrand = products.filter((p) => p.brand).length;
      const withImage = products.filter((p) => p.imageUrl).length;

      console.log(`\n[Data Quality]`);
      console.log(` With price: ${withPrice}/${products.length} (${percentOfAll(withPrice)}%)`);
      console.log(` With THC%: ${withThc}/${products.length} (${percentOfAll(withThc)}%)`);
      console.log(` With brand: ${withBrand}/${products.length} (${percentOfAll(withBrand)}%)`);
      console.log(` With image: ${withImage}/${products.length} (${percentOfAll(withImage)}%)`);

      console.log('\n[Sample Products (first 10)]:');
      for (const p of products.slice(0, 10)) {
        console.log(`\n ${p.name}`);
        console.log(` ID: ${p.productId}`);
        console.log(` Brand: ${p.brand || 'N/A'}`);
        console.log(` Category: ${p.category || 'N/A'} / ${p.subcategory || 'N/A'}`);
        console.log(` THC: ${p.thcPercent !== null ? p.thcPercent + '%' : 'N/A'}`);
        console.log(` CBD: ${p.cbdPercent !== null ? p.cbdPercent + '%' : 'N/A'}`);
        console.log(` Price: ${p.price !== null ? '$' + p.price : 'N/A'} ${p.priceUnit}`);
        console.log(` Weight: ${p.weight || 'N/A'}`);
        console.log(` Image: ${p.imageUrl?.slice(0, 60) || 'N/A'}...`);
        console.log(` In Stock: ${p.inStock}`);
      }

      fs.writeFileSync('/tmp/treez-products.json', JSON.stringify(products, null, 2));
      console.log('\n[Debug] Full product list saved to /tmp/treez-products.json');
    } else {
      console.log('\n[WARNING] No products extracted!');
      console.log('Check /tmp/treez-smoke-test.png for page state');

      // Dump the rendered HTML so the selectors can be adjusted offline.
      const html = await page.content();
      fs.writeFileSync('/tmp/treez-page.html', html);
      console.log('[Debug] Page HTML saved to /tmp/treez-page.html');
    }

    console.log('\n' + divider);
    console.log(products.length > 0 ? 'SMOKE TEST PASSED' : 'SMOKE TEST NEEDS ADJUSTMENT');
    console.log(divider);
  } catch (error: unknown) {
    const failure = error instanceof Error ? error : new Error(String(error));

    console.error('\n' + divider);
    console.error('SMOKE TEST FAILED');
    console.error(divider);
    console.error(`Error: ${failure.message}`);
    console.error(failure.stack);

    // Signal failure without exiting yet, so the `finally` below can still
    // close the browser.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
// Run the smoke test. Anything that escapes main() (for example the browser
// failing to launch) is logged and reported through a non-zero exit status.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
227
backend/src/hydration/normalizers/treez.ts
Normal file
227
backend/src/hydration/normalizers/treez.ts
Normal file
@@ -0,0 +1,227 @@
|
|||||||
|
/**
|
||||||
|
* Treez Platform Normalizer
|
||||||
|
*
|
||||||
|
* Normalizes raw Treez DOM-scraped product data to canonical format.
|
||||||
|
*
|
||||||
|
* Treez is scraped via Puppeteer (no API), so the raw format is
|
||||||
|
* the TreezProductRaw interface from our client.
|
||||||
|
*
|
||||||
|
* Key differences from Dutchie/Jane:
|
||||||
|
* - Data comes from DOM parsing, not API response
|
||||||
|
* - Price is a single value (not multiple weights like Jane)
|
||||||
|
* - Product ID is generated from product name or URL slug
|
||||||
|
* - Less structured data (category/strain inferred from text)
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { BaseNormalizer } from './base';
|
||||||
|
import {
|
||||||
|
NormalizedProduct,
|
||||||
|
NormalizedPricing,
|
||||||
|
NormalizedAvailability,
|
||||||
|
NormalizedBrand,
|
||||||
|
NormalizedCategory,
|
||||||
|
} from '../types';
|
||||||
|
|
||||||
|
/**
 * Normalizer for Treez product data scraped from the rendered menu DOM.
 *
 * Accepts the raw scrape payload, pulls out the product array and maps each
 * raw product to the canonical product / pricing / availability / brand /
 * category records.
 *
 * NOTE(review): `toCents` and `slugify` are not defined in this class; they
 * are presumably inherited from `BaseNormalizer` — confirm their handling of
 * `null` prices and unusual names there.
 */
export class TreezNormalizer extends BaseNormalizer {
  readonly platform = 'treez';
  readonly supportedVersions = [1];

  /**
   * Lower-cased raw category -> canonical display name.
   *
   * Kept in a `Map` rather than an object literal so that inputs such as
   * "constructor" or "toString" cannot resolve to inherited
   * `Object.prototype` members.
   */
  private static readonly CATEGORY_NAMES: ReadonlyMap<string, string> = new Map([
    ['flower', 'Flower'],
    ['vape', 'Vape'],
    ['vapes', 'Vape'],
    ['cartridge', 'Vape'],
    ['edible', 'Edible'],
    ['edibles', 'Edible'],
    ['concentrate', 'Concentrate'],
    ['concentrates', 'Concentrate'],
    ['pre-roll', 'Pre-Roll'],
    ['preroll', 'Pre-Roll'],
    ['pre-rolls', 'Pre-Roll'],
    ['prerolls', 'Pre-Roll'],
    ['topical', 'Topical'],
    ['topicals', 'Topical'],
    ['tincture', 'Tincture'],
    ['tinctures', 'Tincture'],
    ['accessory', 'Accessory'],
    ['accessories', 'Accessory'],
    ['gear', 'Gear'],
  ]);

  // ============================================================
  // EXTRACTION
  // ============================================================

  /**
   * Locate the product array inside a raw payload.
   *
   * Recognised shapes, checked in this order: `{ products: [...] }`, a bare
   * array, `{ hits: [...] }`.
   *
   * @returns The product array, or `[]` (with a warning) for any other shape.
   */
  extractProducts(rawJson: any): any[] {
    // Treez payload format: { products: [...] }
    if (rawJson?.products && Array.isArray(rawJson.products)) {
      return rawJson.products;
    }

    // Direct array of products
    if (Array.isArray(rawJson)) {
      return rawJson;
    }

    // Hits array (normalized format)
    if (rawJson?.hits && Array.isArray(rawJson.hits)) {
      return rawJson.hits;
    }

    console.warn('[TreezNormalizer] Could not extract products from payload');
    return [];
  }

  /**
   * Check that a payload is present and contains at least one product.
   *
   * @returns `valid` is `true` only when `errors` is empty.
   */
  validatePayload(rawJson: any): { valid: boolean; errors: string[] } {
    const errors: string[] = [];

    if (!rawJson) {
      errors.push('Payload is null or undefined');
      return { valid: false, errors };
    }

    const products = this.extractProducts(rawJson);
    if (products.length === 0) {
      errors.push('No products found in payload');
    }

    return { valid: errors.length === 0, errors };
  }

  // ============================================================
  // NORMALIZATION
  // ============================================================

  /**
   * Map one raw product to the canonical product record.
   *
   * @returns `null` (with a warning) when the product has no ID or no name.
   */
  protected normalizeProduct(rawProduct: any, dispensaryId: number): NormalizedProduct | null {
    const externalId = rawProduct.productId;
    if (!externalId) {
      console.warn('[TreezNormalizer] Product missing ID, skipping');
      return null;
    }

    const name = rawProduct.name;
    if (!name) {
      console.warn(`[TreezNormalizer] Product ${externalId} missing name, skipping`);
      return null;
    }

    return {
      externalProductId: String(externalId),
      dispensaryId,
      platform: 'treez',
      platformDispensaryId: '', // Will be set by handler

      // Core fields
      name,
      brandName: rawProduct.brand || null,
      brandId: null, // Treez doesn't expose brand IDs
      category: this.normalizeCategory(rawProduct.category) || null,
      subcategory: rawProduct.subcategory || null,
      type: rawProduct.category || null,
      strainType: rawProduct.subcategory || null, // indica, sativa, hybrid

      // Potency: the content fields mirror the scraped percentages.
      thcPercent: rawProduct.thcPercent ?? null,
      cbdPercent: rawProduct.cbdPercent ?? null,
      thcContent: rawProduct.thcPercent ?? null,
      cbdContent: rawProduct.cbdPercent ?? null,

      // Status: only an explicit `inStock === false` marks a product inactive.
      status: 'Active',
      isActive: rawProduct.inStock !== false,
      medicalOnly: false,
      recOnly: false,

      // Images
      primaryImageUrl: rawProduct.imageUrl || null,
      images: rawProduct.imageUrl
        ? [{ url: rawProduct.imageUrl, position: 0 }]
        : [],

      // Raw reference
      rawProduct,
    };
  }

  /**
   * Build the pricing record. The single scraped price is used for the
   * recreational price and for both ends of its min/max range; all medical
   * and special-price fields are left empty.
   *
   * @returns `null` when the product has no ID.
   */
  protected normalizePricing(rawProduct: any): NormalizedPricing | null {
    const externalId = rawProduct.productId;
    if (!externalId) return null;

    const price = rawProduct.price;

    return {
      externalProductId: String(externalId),

      // Treez typically shows a single price
      priceRec: this.toCents(price),
      priceRecMin: this.toCents(price),
      priceRecMax: this.toCents(price),
      priceRecSpecial: null,

      // Treez doesn't distinguish med pricing in DOM
      priceMed: null,
      priceMedMin: null,
      priceMedMax: null,
      priceMedSpecial: null,

      isOnSpecial: false,
      specialName: null,
      discountPercent: null,
    };
  }

  /**
   * Build the availability record. A product counts as in stock unless
   * `inStock` is explicitly `false`; quantities are never known.
   *
   * @returns `null` when the product has no ID.
   */
  protected normalizeAvailability(rawProduct: any): NormalizedAvailability | null {
    const externalId = rawProduct.productId;
    if (!externalId) return null;

    const inStock = rawProduct.inStock !== false;

    return {
      externalProductId: String(externalId),
      inStock,
      stockStatus: inStock ? 'in_stock' : 'out_of_stock',
      quantity: null, // Treez doesn't expose quantity in DOM
      quantityAvailable: null,
      isBelowThreshold: false,
      optionsBelowThreshold: false,
    };
  }

  /**
   * Build a brand record from the scraped brand name.
   *
   * @returns `null` when the product carries no brand.
   */
  protected extractBrand(rawProduct: any): NormalizedBrand | null {
    const brandName = rawProduct.brand;
    if (!brandName) return null;

    return {
      externalBrandId: null, // Treez doesn't expose brand IDs
      name: brandName,
      slug: this.slugify(brandName),
      logoUrl: null,
    };
  }

  /**
   * Build a category record. The name is canonicalised; the slug is derived
   * from the raw category string.
   *
   * @returns `null` when the product carries no category.
   */
  protected extractCategory(rawProduct: any): NormalizedCategory | null {
    const categoryName = rawProduct.category;
    if (!categoryName) return null;

    return {
      name: this.normalizeCategory(categoryName) || categoryName,
      slug: this.slugify(categoryName),
      parentCategory: null,
    };
  }

  // ============================================================
  // HELPERS
  // ============================================================

  /**
   * Normalize a category name to its canonical display form.
   *
   * Matching is case-insensitive and ignores surrounding whitespace.
   *
   * @returns The canonical name, the input unchanged when it is not a known
   *   category, or `null` for empty input.
   */
  private normalizeCategory(category: string | null | undefined): string | null {
    if (!category) return null;

    const key = category.toLowerCase().trim();
    return TreezNormalizer.CATEGORY_NAMES.get(key) ?? category;
  }
}
|
||||||
570
backend/src/platforms/treez/client.ts
Normal file
570
backend/src/platforms/treez/client.ts
Normal file
@@ -0,0 +1,570 @@
|
|||||||
|
/**
|
||||||
|
* ============================================================
|
||||||
|
* TREEZ PLATFORM CLIENT
|
||||||
|
* ============================================================
|
||||||
|
*
|
||||||
|
* Treez is a fully client-side rendered platform (React/Next.js).
|
||||||
|
* Unlike Dutchie (GraphQL) or Jane (Algolia), Treez requires DOM
|
||||||
|
* parsing after page render. No API endpoints are available.
|
||||||
|
*
|
||||||
|
* Key differences:
|
||||||
|
* - No Cloudflare protection (simpler than Jane)
|
||||||
|
* - Products loaded via infinite scroll
|
||||||
|
* - Data extracted from DOM elements
|
||||||
|
* - Age gate must be bypassed
|
||||||
|
*
|
||||||
|
* URL Pattern: https://{storeId}.treez.io/onlinemenu/?customerType=ADULT
|
||||||
|
* Store ID Format: String slug (e.g., "best")
|
||||||
|
*
|
||||||
|
* ============================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer, { Browser, Page } from 'puppeteer';
|
||||||
|
import puppeteerExtra from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
|
||||||
|
import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
|
||||||
|
|
||||||
|
// Register stealth plugin (good practice even without Cloudflare)
|
||||||
|
puppeteerExtra.use(StealthPlugin());
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// TYPES
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * One product as scraped from a Treez menu page, before normalization.
 */
export interface TreezProductRaw {
  /** Identifier for the product. */
  productId: string;
  /** Product name as displayed. */
  name: string;
  /** Brand name. */
  brand: string;
  /** Product category. */
  category: string;
  /** Strain type: indica, sativa, hybrid. */
  subcategory: string;
  /** THC percentage, or `null` when not found. */
  thcPercent: number | null;
  /** CBD percentage, or `null` when not found. */
  cbdPercent: number | null;
  /** Displayed price, or `null` when not found. */
  price: number | null;
  /** Unit the price applies to. */
  priceUnit: string;
  /** Product image URL, or `null` when not found. */
  imageUrl: string | null;
  /** Whether the product is in stock. */
  inStock: boolean;
  /** Weight string, or `null` when not found. */
  weight: string | null;
}
|
||||||
|
|
||||||
|
/**
 * State for one Treez browser session: the browser and page in use plus the
 * fingerprint and proxy they were configured with.
 */
export interface TreezSession {
  /** Identifier assigned to the session. */
  sessionId: string;
  /** Browser instance used by the session. */
  browser: Browser;
  /** Page the session operates on. */
  page: Page;
  /** Fingerprint associated with the session. */
  fingerprint: BrowserFingerprint;
  /** Proxy URL in use, or `null` when none. */
  proxyUrl: string | null;
  /** When the session started. */
  startedAt: Date;
  /** Store associated with the session, if any. */
  storeId?: string;
}
|
||||||
|
|
||||||
|
/**
 * Basic identifying information for a Treez store.
 */
export interface TreezStoreInfo {
  /** Store identifier. */
  storeId: string;
  /** Store name. */
  name: string;
  /** URL associated with the store. */
  url: string;
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// CONFIGURATION
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Tunables for the Treez client.
 *
 * NOTE(review): the numeric delays/timeouts look like milliseconds (they are
 * passed to `sleep()` and to Puppeteer `timeout` options) — confirm.
 */
export const TREEZ_CONFIG = {
  /** Menu URL template; `{storeId}` is the placeholder for the store slug. */
  baseUrl: 'https://{storeId}.treez.io/onlinemenu/',
  /** General timeout. */
  timeout: 60000,
  /** Timeout for page navigation. */
  navigationTimeout: 60000,
  /** Pause after each scroll step. */
  scrollDelay: 1500,
  /** Upper bound on scroll iterations. */
  maxScrollAttempts: 50,
  /** Pause after clicking the age-gate confirm button. */
  ageGateDelay: 2000,
};
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// SESSION MANAGEMENT
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
// The single active browser session for this module; null when none is open.
let currentSession: TreezSession | null = null;
|
||||||
|
// Rotator used for proxy/fingerprint selection; null until one is set via setCrawlRotator().
let crawlRotator: CrawlRotator | null = null;
|
||||||
|
|
||||||
|
/**
 * Attach (or, with `null`, detach) the CrawlRotator used for proxy and
 * fingerprint selection.
 *
 * @param rotator Rotator to store, or `null` to clear the current one.
 */
export function setCrawlRotator(rotator: CrawlRotator | null): void {
  crawlRotator = rotator;

  // Only attaching is logged; clearing is silent.
  if (!rotator) return;
  console.log('[Treez Client] CrawlRotator attached');
}
|
||||||
|
|
||||||
|
/**
 * Return the CrawlRotator currently stored by this module.
 *
 * @returns The stored rotator, or `null` when none is set.
 */
export function getCrawlRotator(): CrawlRotator | null {
  return crawlRotator;
}
|
||||||
|
|
||||||
|
/**
 * Start a new Treez browser session, closing any session that is still open.
 *
 * The fingerprint and proxy come from the attached CrawlRotator when there is
 * one; otherwise a fixed desktop-Chrome fingerprint and no proxy are used.
 * The page is configured with the fingerprint's viewport and user agent,
 * request interception that drops images, fonts, media and stylesheets, and
 * proxy credentials when the proxy URL contains them.
 *
 * If configuring the page fails after the browser has launched, the browser
 * is closed before the error is rethrown so no browser process is leaked.
 *
 * @param storeId Optional store identifier recorded on the session.
 * @returns The new session, which is also stored as the module's current session.
 */
export async function startSession(storeId?: string): Promise<TreezSession> {
  if (currentSession) {
    console.log('[Treez Client] Closing existing session before starting new one');
    await endSession();
  }

  // Get fingerprint from rotator or use defaults
  let fingerprint: BrowserFingerprint;
  let proxyUrl: string | null = null;

  if (crawlRotator) {
    fingerprint = crawlRotator.userAgent.getCurrent();
    const proxy = crawlRotator.proxy.getCurrent();
    if (proxy) {
      proxyUrl = crawlRotator.proxy.getProxyUrl(proxy);
    }
  } else {
    // Default fingerprint for local testing
    fingerprint = {
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      browserName: 'Chrome',
      deviceCategory: 'desktop',
      platform: 'Windows',
      screenWidth: 1920,
      screenHeight: 1080,
      viewportWidth: 1920,
      viewportHeight: 1080,
      acceptLanguage: 'en-US,en;q=0.9',
      secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
      secChUaPlatform: '"Windows"',
      secChUaMobile: '?0',
      httpFingerprint: {
        browserType: 'Chrome' as const,
        headers: {},
        headerOrder: [],
        curlImpersonateBinary: 'curl_chrome131',
        hasDNT: false,
      },
    };
  }

  // Build browser args
  const browserArgs = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled',
  ];

  if (proxyUrl) {
    // Only host[:port] goes on the command line; credentials are supplied
    // separately through page.authenticate() below.
    const proxyMatch = proxyUrl.match(/:\/\/([^@]+@)?([^/]+)/);
    if (proxyMatch) {
      browserArgs.push(`--proxy-server=${proxyMatch[2]}`);
    }
  }

  console.log('[Treez Client] Launching browser...');
  const browser = await puppeteerExtra.launch({
    headless: true,
    args: browserArgs,
  });

  let page: Page;
  try {
    page = await browser.newPage();

    await page.setViewport({
      width: fingerprint.viewportWidth || 1920,
      height: fingerprint.viewportHeight || 1080,
    });

    await page.setUserAgent(fingerprint.userAgent);

    // Block unnecessary resources to save bandwidth.
    // We only need HTML/JS for DOM extraction - not images, fonts, etc.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const blocked = ['image', 'font', 'media', 'stylesheet'].includes(request.resourceType());
      const settled = blocked ? request.abort() : request.continue();
      // abort()/continue() return promises; a rejection here (e.g. the request
      // was already handled or the page went away) must not surface as an
      // unhandled rejection.
      settled.catch(() => undefined);
    });

    // Handle proxy authentication if needed
    if (proxyUrl) {
      const authMatch = proxyUrl.match(/:\/\/([^:]+):([^@]+)@/);
      if (authMatch) {
        await page.authenticate({
          username: authMatch[1],
          password: authMatch[2],
        });
      }
    }
  } catch (err) {
    // Setup failed after launch: close the browser so it is not orphaned. A
    // failure while closing is ignored so the original error is the one thrown.
    await browser.close().catch(() => undefined);
    throw err;
  }

  const sessionId = `treez_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;

  currentSession = {
    sessionId,
    browser,
    page,
    fingerprint,
    proxyUrl,
    startedAt: new Date(),
    storeId,
  };

  console.log(`[Treez Client] Started session ${sessionId}`);
  console.log(`[Treez Client] Browser: ${fingerprint.browserName} (${fingerprint.deviceCategory})`);
  if (proxyUrl) {
    // Mask the password before logging the proxy URL.
    console.log(`[Treez Client] Proxy: ${proxyUrl.replace(/:[^:@]+@/, ':***@')}`);
  }

  return currentSession;
}
|
||||||
|
|
||||||
|
/**
 * Close the current browser session, if any, and clear the module's session
 * reference. An error while closing the browser is logged, not thrown.
 */
export async function endSession(): Promise<void> {
  if (!currentSession) return;

  const elapsedSeconds = Math.round((Date.now() - currentSession.startedAt.getTime()) / 1000);
  console.log(`[Treez Client] Ending session ${currentSession.sessionId} (${elapsedSeconds}s)`);

  try {
    await currentSession.browser.close();
  } catch (e) {
    console.warn('[Treez Client] Error closing browser:', e);
  }

  // The reference is cleared whether or not the close succeeded.
  currentSession = null;
}
|
||||||
|
|
||||||
|
/**
 * Return the session currently held by this module.
 *
 * @returns The active session, or `null` when none is open.
 */
export function getCurrentSession(): TreezSession | null {
  return currentSession;
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// AGE GATE HANDLING
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Dismiss the age-verification modal if the page is showing one.
 *
 * @param page Page to inspect.
 * @returns `true` when a gate was found and its confirm button was clicked
 *   (even if the modal did not disappear within the wait); `false` when no
 *   gate or no button was found, or when any step threw.
 */
export async function bypassAgeGate(page: Page): Promise<boolean> {
  console.log('[Treez Client] Checking for age gate...');

  try {
    const gate = await page.$('[data-testid="age-gate-modal"], [class*="AgeGate"]');
    if (!gate) {
      console.log('[Treez Client] No age gate detected');
      return false;
    }

    console.log('[Treez Client] Age gate detected, clicking confirm button...');

    const confirmButton = await page.$('[data-testid="age-gate-submit-button"], button[type="submit"]');
    if (!confirmButton) {
      console.log('[Treez Client] No submit button found');
      return false;
    }

    await confirmButton.click();
    console.log('[Treez Client] Clicked confirm button');

    await sleep(TREEZ_CONFIG.ageGateDelay);

    // Poll until the modal is gone; a timeout is logged but tolerated.
    const gateIsGone = () => !document.querySelector('[data-testid="age-gate-modal"]');
    await page.waitForFunction(gateIsGone, { timeout: 10000 }).catch(() => {
      console.log('[Treez Client] Gate may still be visible, continuing anyway');
    });

    console.log('[Treez Client] Age gate bypassed');
    return true;
  } catch (err: any) {
    console.log(`[Treez Client] Age gate error: ${err.message}`);
    return false;
  }
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// NAVIGATION & SCRAPING
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Build the online-menu URL for a store.
 *
 * @param storeId Store slug used as the subdomain.
 * @param customerType Value of the `customerType` query parameter; defaults to `'ADULT'`.
 * @returns The menu URL for that store and customer type.
 */
export function buildMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
  const origin = 'https://' + storeId + '.treez.io';
  const query = '?customerType=' + customerType;
  return origin + '/onlinemenu/' + query;
}
|
||||||
|
|
||||||
|
/**
 * Open a store's menu in the current session's page and get past the age gate.
 *
 * @param storeId Store slug whose menu should be opened.
 * @throws Error when no session is active.
 */
export async function navigateToMenu(storeId: string): Promise<void> {
  const session = currentSession;
  if (!session) {
    throw new Error('[Treez Client] No active session - call startSession() first');
  }

  const menuUrl = buildMenuUrl(storeId);
  console.log(`[Treez Client] Navigating to ${menuUrl}`);

  await session.page.goto(menuUrl, {
    waitUntil: 'networkidle2',
    timeout: TREEZ_CONFIG.navigationTimeout,
  });

  // Fixed pause to let the React app render before looking for the gate.
  await sleep(2000);

  await bypassAgeGate(session.page);

  // Second fixed pause to let the menu content load.
  await sleep(2000);

  console.log('[Treez Client] Menu page loaded');
}
|
||||||
|
|
||||||
|
/**
 * Drive the page's infinite scroll until no more content appears.
 *
 * Each iteration reads `document.body.scrollHeight`, scrolls to the bottom and
 * waits `TREEZ_CONFIG.scrollDelay`. The loop ends when the height has been
 * unchanged on three consecutive checks, or after
 * `TREEZ_CONFIG.maxScrollAttempts` scrolls.
 *
 * @param page - Page that is already showing the menu.
 * @returns Number of scrolls actually performed.
 */
export async function scrollToLoadAll(page: Page): Promise<number> {
  console.log('[Treez Client] Starting infinite scroll...');

  let lastHeight = 0;
  let stalledChecks = 0;
  let scrolls = 0;

  while (scrolls < TREEZ_CONFIG.maxScrollAttempts) {
    const height = await page.evaluate(() => document.body.scrollHeight);

    // Count consecutive checks with no growth; any growth resets the counter.
    stalledChecks = height === lastHeight ? stalledChecks + 1 : 0;
    if (stalledChecks >= 3) {
      console.log('[Treez Client] No new content after 3 attempts, stopping');
      break;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(TREEZ_CONFIG.scrollDelay);

    lastHeight = height;
    scrolls += 1;

    // Progress log every fifth scroll.
    const shouldReport = scrolls % 5 === 0;
    if (shouldReport) {
      const loaded = await page.evaluate(
        () => document.querySelectorAll('[class*="product_product__"]').length
      );
      console.log(`[Treez Client] Scroll ${scrolls}: ${loaded} products loaded`);
    }
  }

  return scrolls;
}
|
||||||
|
|
||||||
|
/**
 * Extract products from the current page.
 *
 * Runs a single `page.evaluate` call that walks every element whose class
 * contains `product_product__` and derives, per card:
 * name, product ID (from the `/product/{id}` link, else a slug built from the
 * name), brand, price, image URL, THC/CBD percentages, weight (parsed from the
 * name), category, strain subcategory and stock status.
 *
 * Cards without a name, and cards whose name was already seen, are skipped.
 * A card that throws during extraction is skipped as well.
 *
 * @param page - Page that is already showing a fully scrolled menu.
 * @returns One raw product record per distinct product name.
 */
export async function extractProducts(page: Page): Promise<TreezProductRaw[]> {
  console.log('[Treez Client] Extracting products from DOM...');

  // The callback is handed to page.evaluate, so it must stay self-contained:
  // keep every helper, table and regex inside it rather than at module scope.
  const products = await page.evaluate(() => {
    const results: any[] = [];

    // Name-based category patterns, checked in order; first match wins.
    // Matching uses the product NAME only (not the card's full text) to avoid
    // false positives from navigation elements.
    const categoryPatterns = [
      { pattern: /vape|cart(?:ridge)?|pen|pod/i, category: 'vape' },
      { pattern: /edible|gummy|gummies|chocolate|candy/i, category: 'edible' },
      { pattern: /concentrate|dab|wax|shatter|rosin|resin/i, category: 'concentrate' },
      { pattern: /pre.?roll|joint|blunt/i, category: 'pre-roll' },
      { pattern: /topical|balm|cream|lotion/i, category: 'topical' },
      { pattern: /tincture/i, category: 'tincture' },
    ];
    const strainTypes = ['indica', 'sativa', 'hybrid'];

    // Find all product cards that carry at least a name or a price element.
    const productElements = Array.from(
      document.querySelectorAll('[class*="product_product__"]')
    ).filter(el => {
      const hasName = el.querySelector('[class*="product__name"]') || el.querySelector('[class*="name__"]');
      const hasPrice = el.querySelector('[class*="price"]');
      return hasName || hasPrice;
    });

    // NOTE(review): de-duplication is by display name, so two distinct
    // products sharing a name collapse into one record - confirm intended.
    const seen = new Set<string>();

    for (const el of productElements) {
      try {
        // Get product name
        const nameEl = el.querySelector('[class*="product__name"], [class*="name__"]');
        const name = nameEl?.textContent?.trim() || '';

        if (!name || seen.has(name)) continue;
        seen.add(name);

        // Get product ID from link; fall back to a slug of the name.
        const linkEl = el.querySelector('a[href*="/product/"]');
        let productId = '';
        if (linkEl) {
          const href = linkEl.getAttribute('href') || '';
          const match = href.match(/\/product\/([^\/\?]+)/);
          productId = match ? match[1] : '';
        }
        if (!productId) {
          productId = `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`;
        }

        // Get brand
        const brandEl = el.querySelector('[class*="brand"], [class*="Brand"]');
        const brand = brandEl?.textContent?.trim() || '';

        // Get price: first "$<number>" in the price element's text.
        const priceEl = el.querySelector('[class*="price__ins"], [class*="price"]');
        const priceText = priceEl?.textContent || '';
        const priceMatch = priceText.match(/\$(\d+(?:\.\d{2})?)/);
        const price = priceMatch ? parseFloat(priceMatch[1]) : null;

        // Get image URL; unwrap the original from a `/_next/image?url=...` proxy URL.
        const imgEl = el.querySelector('img');
        let imageUrl = imgEl?.getAttribute('src') || null;
        if (imageUrl && imageUrl.includes('/_next/image')) {
          const urlMatch = imageUrl.match(/url=([^&]+)/);
          if (urlMatch) {
            imageUrl = decodeURIComponent(urlMatch[1]);
          }
        }

        // Get text content for data extraction
        const text = el.textContent || '';
        const textLower = text.toLowerCase();

        // Get THC/CBD ("25% THC" form first, then "THC: 25%" form).
        const thcMatch = text.match(/(?:THC[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*THC/i) ||
          text.match(/THC[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
        const cbdMatch = text.match(/(?:CBD[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*CBD/i) ||
          text.match(/CBD[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
        const thcPercent = thcMatch ? parseFloat(thcMatch[1]) : null;
        const cbdPercent = cbdMatch ? parseFloat(cbdMatch[1]) : null;

        // Get weight from name. The unit must end at a word boundary (an
        // optional "rams"/"ram"/"s" suffix is tolerated) so that counts such as
        // "5 Gummies" or "2 Grape" are no longer misread as "5g" / "2g".
        const weightMatch = name.match(/(\d+(?:\.\d+)?)\s*(mg|g|oz)(?:rams?|s)?\b/i);
        const weight = weightMatch ? `${weightMatch[1]}${weightMatch[2].toLowerCase()}` : null;

        // Determine category: explicit name pattern first, weight heuristic second.
        let category = '';
        for (const { pattern, category: cat } of categoryPatterns) {
          if (pattern.test(name)) {
            category = cat;
            break;
          }
        }

        // If no explicit category found, infer from weight
        if (!category && weight) {
          const weightLower = weight.toLowerCase();
          if (weightLower.includes('g') && !weightLower.includes('mg')) {
            // Gram weights (3.5g, 1g, 7g, etc.) are typically flower
            category = 'flower';
          } else if (weightLower.includes('mg')) {
            // Milligram weights are typically edibles
            category = 'edible';
          }
        }

        // Get strain type: first of indica/sativa/hybrid found in the card text.
        let subcategory = '';
        for (const strain of strainTypes) {
          if (textLower.includes(strain)) {
            subcategory = strain;
            break;
          }
        }

        // Check stock status
        const inStock = !textLower.includes('out of stock') && !textLower.includes('sold out');

        results.push({
          productId,
          name,
          brand,
          category,
          subcategory,
          thcPercent,
          cbdPercent,
          price,
          priceUnit: weight || '',
          imageUrl,
          inStock,
          weight,
        });
      } catch (err) {
        // Deliberate best-effort: skip products that fail extraction so one
        // malformed card cannot abort the whole scrape.
      }
    }

    return results;
  });

  console.log(`[Treez Client] Extracted ${products.length} products`);
  return products;
}
|
||||||
|
|
||||||
|
/**
 * Product-discovery entry point for one store.
 *
 * Navigates the active session's page to the store menu, derives the store
 * name from the page title (the segment after the first `|`, falling back to
 * the whole title), scrolls until everything is loaded and extracts the
 * products. When a crawl rotator is attached and at least one product was
 * found, a success is recorded on it.
 *
 * @param storeId - Treez store slug.
 * @returns The extracted products, basic store info and the scroll count.
 * @throws Error when no session is active (`startSession()` was not called).
 */
export async function fetchAllProducts(storeId: string): Promise<{
  products: TreezProductRaw[];
  storeInfo: TreezStoreInfo;
  scrollCount: number;
}> {
  const session = currentSession;
  if (!session) {
    throw new Error('[Treez Client] No active session - call startSession() first');
  }
  const activePage = session.page;

  await navigateToMenu(storeId);

  // Store name comes from the page title.
  const title = await activePage.title();
  const nameFromTitle = title.split('|')[1]?.trim();
  const storeInfo: TreezStoreInfo = {
    storeId,
    name: nameFromTitle || title,
    url: buildMenuUrl(storeId),
  };

  const scrollCount = await scrollToLoadAll(activePage);
  const products = await extractProducts(activePage);

  // Only a non-empty result counts as a success for the rotator.
  const gotProducts = products.length > 0;
  if (crawlRotator && gotProducts) {
    await crawlRotator.recordSuccess();
  }

  return { products, storeInfo, scrollCount };
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// UTILITY
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Promise-based delay.
 *
 * @param ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
50
backend/src/platforms/treez/index.ts
Normal file
50
backend/src/platforms/treez/index.ts
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
/**
|
||||||
|
* Treez Platform Module
|
||||||
|
*
|
||||||
|
* Single export point for all Treez communication.
|
||||||
|
* All Treez workers MUST import from this module.
|
||||||
|
*/
|
||||||
|
|
||||||
|
export {
|
||||||
|
// Session Management
|
||||||
|
startSession,
|
||||||
|
endSession,
|
||||||
|
getCurrentSession,
|
||||||
|
|
||||||
|
// Proxy/Rotation
|
||||||
|
setCrawlRotator,
|
||||||
|
getCrawlRotator,
|
||||||
|
|
||||||
|
// Core Operations
|
||||||
|
navigateToMenu,
|
||||||
|
scrollToLoadAll,
|
||||||
|
extractProducts,
|
||||||
|
fetchAllProducts,
|
||||||
|
bypassAgeGate,
|
||||||
|
|
||||||
|
// URL Building
|
||||||
|
buildMenuUrl,
|
||||||
|
|
||||||
|
// Configuration
|
||||||
|
TREEZ_CONFIG,
|
||||||
|
|
||||||
|
// Types
|
||||||
|
type TreezSession,
|
||||||
|
type TreezStoreInfo,
|
||||||
|
type TreezProductRaw,
|
||||||
|
} from './client';
|
||||||
|
|
||||||
|
// High-level Query Functions
|
||||||
|
export {
|
||||||
|
fetchProductsByStoreId,
|
||||||
|
fetchProductsFromUrl,
|
||||||
|
extractStoreIdFromUrl,
|
||||||
|
validateStoreId,
|
||||||
|
getMenuUrl,
|
||||||
|
|
||||||
|
// Types
|
||||||
|
type FetchProductsResult,
|
||||||
|
} from './queries';
|
||||||
|
|
||||||
|
// Re-export CrawlRotator types from canonical location
|
||||||
|
export type { CrawlRotator, Proxy, ProxyStats } from '../../services/crawl-rotator';
|
||||||
132
backend/src/platforms/treez/queries.ts
Normal file
132
backend/src/platforms/treez/queries.ts
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
/**
|
||||||
|
* Treez High-Level Query Functions
|
||||||
|
*
|
||||||
|
* Wraps the low-level client methods with business logic
|
||||||
|
* for common operations like product fetching.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import {
|
||||||
|
startSession,
|
||||||
|
endSession,
|
||||||
|
fetchAllProducts,
|
||||||
|
buildMenuUrl,
|
||||||
|
TreezProductRaw,
|
||||||
|
TreezStoreInfo,
|
||||||
|
} from './client';
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// PRODUCT OPERATIONS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/** Result of a complete product fetch for one Treez store. */
export interface FetchProductsResult {
  /** Store info reported by the client for the fetched store. */
  store: TreezStoreInfo;
  /** Raw products extracted from the menu page. */
  products: TreezProductRaw[];
  /** Number of products captured (`products.length` at creation time). */
  totalCaptured: number;
  /** Number of scrolls performed while loading the menu. */
  scrollCount: number;
}
|
||||||
|
|
||||||
|
/**
 * Fetch every product of a Treez store inside a fresh session.
 *
 * The session is started, the products are fetched, and the session is always
 * ended afterwards - including when starting the session or fetching fails.
 *
 * @param storeId - Treez store ID (slug like "best")
 * @returns Store info, products, product count and scroll count
 */
export async function fetchProductsByStoreId(storeId: string): Promise<FetchProductsResult> {
  try {
    await startSession(storeId);

    const fetched = await fetchAllProducts(storeId);
    const result: FetchProductsResult = {
      store: fetched.storeInfo,
      products: fetched.products,
      totalCaptured: fetched.products.length,
      scrollCount: fetched.scrollCount,
    };
    return result;
  } finally {
    await endSession();
  }
}
|
||||||
|
|
||||||
|
/**
 * Fetch products given a Treez menu URL instead of a store ID.
 *
 * The store ID is taken from the URL via `extractStoreIdFromUrl` and the work
 * is delegated to `fetchProductsByStoreId`.
 *
 * @param menuUrl - Full Treez menu URL
 * @returns Products and store data
 * @throws Error when no store ID can be extracted from `menuUrl`
 */
export async function fetchProductsFromUrl(menuUrl: string): Promise<FetchProductsResult> {
  const storeId = extractStoreIdFromUrl(menuUrl);
  if (storeId) {
    return fetchProductsByStoreId(storeId);
  }
  throw new Error(`Could not extract store ID from URL: ${menuUrl}`);
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// STORE OPERATIONS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Extract store ID from a Treez URL
 *
 * Supported format:
 * - https://best.treez.io/onlinemenu/  ->  "best"
 *
 * Not supported (returns null, caller must handle it):
 * - custom domains such as https://shop.bestdispensary.com/ - resolving these
 *   would require following a redirect.
 *
 * The store ID must be a single host label (letters, digits, `_`, `-`) directly
 * in front of `.treez.io`, and `.treez.io` must end the host name. This rejects
 * path segments (`https://shop/foo.treez.io`) and look-alike hosts
 * (`https://best.treez.io.evil.com`). The original casing is preserved.
 *
 * @param url - Treez menu URL
 * @returns Store ID or null if not found
 */
export function extractStoreIdFromUrl(url: string): string | null {
  // Pattern 1: {storeId}.treez.io, where the host must end right after "io"
  // (next char may be "/", ":", "?", "#" or end of input).
  const treezMatch = url.match(/https?:\/\/([\w-]+)\.treez\.io(?![\w.-])/i);

  // Pattern 2 (custom domain) is not handled here; fall through to null.
  return treezMatch?.[1] ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Check whether a store's menu page can be opened.
 *
 * Starts a session, loads the menu URL (waiting for `domcontentloaded`, 30s
 * timeout) and inspects the page title: a title containing "404" or
 * "not found" (case-insensitive) means the store is treated as missing.
 * Any error along the way yields `false`. The session is always ended.
 *
 * NOTE(review): only the title is inspected; the HTTP status of the
 * navigation response is not checked.
 *
 * @param storeId - Treez store ID
 * @returns True if store is accessible
 */
export async function validateStoreId(storeId: string): Promise<boolean> {
  try {
    await startSession(storeId);

    const client = await import('./client');
    const session = client.getCurrentSession();
    if (!session) {
      // No session to work with: report the store as not accessible.
      return false;
    }

    const { page } = session;
    await page.goto(buildMenuUrl(storeId), {
      waitUntil: 'domcontentloaded',
      timeout: 30000,
    });

    // Check if we got a valid page (not 404)
    const loweredTitle = (await page.title()).toLowerCase();
    const looksMissing = ['404', 'not found'].some((marker) => loweredTitle.includes(marker));

    return !looksMissing;
  } catch {
    return false;
  } finally {
    await endSession();
  }
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// UTILITY FUNCTIONS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Direct Treez menu URL for a store; thin wrapper over `buildMenuUrl`.
 *
 * @param storeId - Treez store ID
 * @param customerType - Menu variant, defaults to `'ADULT'`
 * @returns The URL returned by `buildMenuUrl(storeId, customerType)`
 */
export function getMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
  const menuUrl = buildMenuUrl(storeId, customerType);
  return menuUrl;
}
|
||||||
@@ -27,3 +27,6 @@ export { handleStoreDiscoveryDutchie } from './store-discovery-dutchie';
|
|||||||
export { handleStoreDiscoveryJane } from './store-discovery-jane';
|
export { handleStoreDiscoveryJane } from './store-discovery-jane';
|
||||||
export { handleEntryPointDiscoveryJane } from './entry-point-discovery-jane';
|
export { handleEntryPointDiscoveryJane } from './entry-point-discovery-jane';
|
||||||
export { handleProductDiscoveryJane } from './product-discovery-jane';
|
export { handleProductDiscoveryJane } from './product-discovery-jane';
|
||||||
|
|
||||||
|
// Treez Platform Handlers
|
||||||
|
export { handleProductDiscoveryTreez } from './product-discovery-treez';
|
||||||
|
|||||||
@@ -126,6 +126,28 @@ export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise<T
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// STEP 2b: Apply stored fingerprint (timezone, locale)
|
||||||
|
// CRITICAL: Must match the IP's geographic location
|
||||||
|
// ============================================================
|
||||||
|
if (ctx.fingerprint?.timezone) {
|
||||||
|
try {
|
||||||
|
const client = await page.target().createCDPSession();
|
||||||
|
await client.send('Emulation.setTimezoneOverride', { timezoneId: ctx.fingerprint.timezone });
|
||||||
|
console.log(`[ProductDiscoveryHTTP] Browser timezone set to: ${ctx.fingerprint.timezone}`);
|
||||||
|
} catch (tzErr: any) {
|
||||||
|
console.warn(`[ProductDiscoveryHTTP] Failed to set timezone: ${tzErr.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set locale to match proxy region (en-US for US proxies)
|
||||||
|
if (ctx.fingerprint?.locale) {
|
||||||
|
await page.setExtraHTTPHeaders({
|
||||||
|
'Accept-Language': `${ctx.fingerprint.locale},en;q=0.9`,
|
||||||
|
});
|
||||||
|
console.log(`[ProductDiscoveryHTTP] Accept-Language set to: ${ctx.fingerprint.locale}`);
|
||||||
|
}
|
||||||
|
|
||||||
await ctx.heartbeat();
|
await ctx.heartbeat();
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|||||||
172
backend/src/tasks/handlers/product-discovery-treez.ts
Normal file
172
backend/src/tasks/handlers/product-discovery-treez.ts
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
/**
|
||||||
|
* Treez Product Discovery Handler
|
||||||
|
*
|
||||||
|
* Fetches all products from a Treez store via Puppeteer + DOM scraping.
|
||||||
|
*
|
||||||
|
* Flow:
|
||||||
|
* 1. Load dispensary with platform_dispensary_id (store slug)
|
||||||
|
* 2. Navigate to menu URL, bypass age gate
|
||||||
|
* 3. Scroll to load all products (infinite scroll)
|
||||||
|
* 4. Extract products from DOM
|
||||||
|
* 5. Save raw payload to filesystem
|
||||||
|
* 6. Queue product_refresh task for normalization
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { TaskContext, TaskResult } from '../task-worker';
|
||||||
|
import {
|
||||||
|
setCrawlRotator,
|
||||||
|
fetchProductsByStoreId,
|
||||||
|
} from '../../platforms/treez';
|
||||||
|
import { saveRawPayload } from '../../utils/payload-storage';
|
||||||
|
import { taskService } from '../task-service';
|
||||||
|
|
||||||
|
/**
 * Task handler: discover all products of one Treez dispensary.
 *
 * Loads the dispensary row named by `task.dispensary_id`, scrapes its Treez
 * menu via `fetchProductsByStoreId`, stores the raw payload, updates the
 * dispensary's bookkeeping columns and queues a `product_refresh` task that
 * points at the saved payload.
 *
 * Failures are reported through the returned `TaskResult` (`success: false`
 * plus `error`) rather than thrown.
 *
 * @param ctx - Task context; `pool`, `task` and `crawlRotator` are used.
 * @returns Success result with product/payload details, or a failure result.
 */
export async function handleProductDiscoveryTreez(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task, crawlRotator } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return {
      success: false,
      error: 'Missing dispensary_id in task',
    };
  }

  console.log(`[TreezProductDiscovery] Starting for dispensary ${dispensaryId}`);

  try {
    // Load dispensary
    const dispResult = await pool.query(
      `SELECT id, name, menu_url, platform_dispensary_id, menu_type, platform
       FROM dispensaries WHERE id = $1`,
      [dispensaryId]
    );

    if (dispResult.rows.length === 0) {
      return {
        success: false,
        error: `Dispensary ${dispensaryId} not found`,
      };
    }

    const dispensary = dispResult.rows[0];

    // The Treez store slug is kept in platform_dispensary_id.
    if (!dispensary.platform_dispensary_id) {
      return {
        success: false,
        error: `Dispensary ${dispensaryId} has no platform_dispensary_id (Treez store ID)`,
      };
    }

    const storeId = dispensary.platform_dispensary_id;
    console.log(`[TreezProductDiscovery] Fetching products for Treez store "${storeId}"`);

    // Attach crawl rotator
    if (crawlRotator) {
      setCrawlRotator(crawlRotator);
    }

    // Fetch products via DOM scraping
    const result = await fetchProductsByStoreId(storeId);

    if (result.products.length === 0) {
      console.warn(`[TreezProductDiscovery] No products captured for dispensary ${dispensaryId}`);

      // Update dispensary with failure.
      // NOTE(review): unlike the catch path below, this update does not move
      // `stage` to 'failing' after repeated failures - confirm that is intended.
      await pool.query(
        `UPDATE dispensaries
         SET consecutive_failures = consecutive_failures + 1,
             updated_at = NOW()
         WHERE id = $1`,
        [dispensaryId]
      );

      return {
        success: false,
        error: 'No products captured from Treez menu page',
        productCount: 0,
      };
    }

    console.log(`[TreezProductDiscovery] Captured ${result.products.length} products`);

    // Build payload for storage
    const rawPayload = {
      products: result.products, // Store the scraped product data
      store: {
        storeId: result.store.storeId,
        name: result.store.name,
        url: result.store.url,
      },
      capturedAt: new Date().toISOString(),
      platform: 'treez',
      dispensaryId,
      scrollCount: result.scrollCount,
    };

    // Save raw payload (platform = 'treez')
    const { id: payloadId, sizeBytes } = await saveRawPayload(
      pool,
      dispensaryId,
      rawPayload,
      null, // crawl_run_id
      result.products.length,
      'treez' // platform
    );

    console.log(`[TreezProductDiscovery] Saved payload ${payloadId} (${Math.round(sizeBytes / 1024)}KB)`);

    // Update dispensary stage and timestamps; a success resets the failure streak.
    await pool.query(
      `UPDATE dispensaries
       SET stage = 'hydrating',
           last_fetch_at = NOW(),
           product_count = $2,
           consecutive_successes = consecutive_successes + 1,
           consecutive_failures = 0,
           updated_at = NOW()
       WHERE id = $1`,
      [dispensaryId, result.products.length]
    );

    // Queue product_refresh task for normalization
    console.log(`[TreezProductDiscovery] Queuing product_refresh for payload ${payloadId}`);
    await taskService.createTask({
      role: 'product_refresh',
      dispensary_id: dispensaryId,
      platform: 'treez',
      priority: task.priority || 0,
      payload: { payload_id: payloadId },
    });

    return {
      success: true,
      productCount: result.products.length,
      payloadId,
      payloadSizeKB: Math.round(sizeBytes / 1024),
      storeInfo: {
        storeId: result.store.storeId,
        name: result.store.name,
      },
      scrollCount: result.scrollCount,
      queuedProductRefresh: true,
    };
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[TreezProductDiscovery] Error:`, errorMessage);

    // Update dispensary with failure. This bookkeeping is best-effort: a DB
    // error here must not mask the original failure, but it is logged instead
    // of being silently dropped.
    try {
      await pool.query(
        `UPDATE dispensaries
         SET consecutive_failures = consecutive_failures + 1,
             stage = CASE WHEN consecutive_failures >= 2 THEN 'failing' ELSE stage END,
             updated_at = NOW()
         WHERE id = $1`,
        [dispensaryId]
      );
    } catch (dbError: unknown) {
      const dbMessage = dbError instanceof Error ? dbError.message : 'Unknown error';
      console.warn(
        `[TreezProductDiscovery] Could not record failure for dispensary ${dispensaryId}: ${dbMessage}`
      );
    }

    return {
      success: false,
      error: errorMessage,
    };
  }
}
|
||||||
@@ -24,6 +24,7 @@
|
|||||||
import { TaskContext, TaskResult } from '../task-worker';
|
import { TaskContext, TaskResult } from '../task-worker';
|
||||||
import { DutchieNormalizer } from '../../hydration/normalizers/dutchie';
|
import { DutchieNormalizer } from '../../hydration/normalizers/dutchie';
|
||||||
import { JaneNormalizer } from '../../hydration/normalizers/jane';
|
import { JaneNormalizer } from '../../hydration/normalizers/jane';
|
||||||
|
import { TreezNormalizer } from '../../hydration/normalizers/treez';
|
||||||
import { BaseNormalizer } from '../../hydration/normalizers/base';
|
import { BaseNormalizer } from '../../hydration/normalizers/base';
|
||||||
import {
|
import {
|
||||||
upsertStoreProducts,
|
upsertStoreProducts,
|
||||||
@@ -37,6 +38,7 @@ import { taskService } from '../task-service';
|
|||||||
const NORMALIZERS: Record<string, BaseNormalizer> = {
|
const NORMALIZERS: Record<string, BaseNormalizer> = {
|
||||||
dutchie: new DutchieNormalizer(),
|
dutchie: new DutchieNormalizer(),
|
||||||
jane: new JaneNormalizer(),
|
jane: new JaneNormalizer(),
|
||||||
|
treez: new TreezNormalizer(),
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -87,6 +87,9 @@ import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane';
|
|||||||
import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane';
|
import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane';
|
||||||
import { handleProductDiscoveryJane } from './handlers/product-discovery-jane';
|
import { handleProductDiscoveryJane } from './handlers/product-discovery-jane';
|
||||||
|
|
||||||
|
// Treez Platform Handlers
|
||||||
|
import { handleProductDiscoveryTreez } from './handlers/product-discovery-treez';
|
||||||
|
|
||||||
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
|
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
|
||||||
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
|
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
|
||||||
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
|
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
|
||||||
@@ -136,6 +139,14 @@ const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0
|
|||||||
// How long to wait (ms) when in backoff state before rechecking resources
|
// How long to wait (ms) when in backoff state before rechecking resources
|
||||||
const BACKOFF_DURATION_MS = parseInt(process.env.BACKOFF_DURATION_MS || '10000');
|
const BACKOFF_DURATION_MS = parseInt(process.env.BACKOFF_DURATION_MS || '10000');
|
||||||
|
|
||||||
|
/**
 * Fingerprint a worker stores after preflight and hands to task handlers
 * through `TaskContext.fingerprint`. Every field is optional.
 */
export interface WorkerFingerprint {
  /** Timezone identifier detected during preflight. */
  timezone?: string;
  /** City of the detected location. */
  city?: string;
  /** State / region of the detected location. */
  state?: string;
  /** Proxy IP observed during preflight. */
  ip?: string;
  /** Locale tag, e.g. `en-US`. */
  locale?: string;
}
|
||||||
|
|
||||||
export interface TaskContext {
|
export interface TaskContext {
|
||||||
pool: Pool;
|
pool: Pool;
|
||||||
workerId: string;
|
workerId: string;
|
||||||
@@ -144,6 +155,8 @@ export interface TaskContext {
|
|||||||
crawlRotator?: CrawlRotator;
|
crawlRotator?: CrawlRotator;
|
||||||
/** Update the current step being executed (shown in dashboard) */
|
/** Update the current step being executed (shown in dashboard) */
|
||||||
updateStep: (step: string, detail?: string) => void;
|
updateStep: (step: string, detail?: string) => void;
|
||||||
|
/** Worker's stored fingerprint from preflight (timezone, locale, etc.) */
|
||||||
|
fingerprint?: WorkerFingerprint;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TaskResult {
|
export interface TaskResult {
|
||||||
@@ -201,6 +214,17 @@ function getHandlerForTask(task: WorkerTask): TaskHandler | undefined {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ==========================================================================
|
||||||
|
// TREEZ PLATFORM ROUTING
|
||||||
|
// ==========================================================================
|
||||||
|
if (platform === 'treez') {
|
||||||
|
if (role === 'product_discovery') {
|
||||||
|
console.log(`[TaskWorker] Using Treez handler for product_discovery`);
|
||||||
|
return handleProductDiscoveryTreez;
|
||||||
|
}
|
||||||
|
// Treez uses shared product_refresh handler via normalizer registry
|
||||||
|
}
|
||||||
|
|
||||||
// ==========================================================================
|
// ==========================================================================
|
||||||
// DUTCHIE PLATFORM ROUTING (default)
|
// DUTCHIE PLATFORM ROUTING (default)
|
||||||
// ==========================================================================
|
// ==========================================================================
|
||||||
@@ -330,6 +354,8 @@ export class TaskWorker {
|
|||||||
private geoCity: string | null = null;
|
private geoCity: string | null = null;
|
||||||
private geoProxyUrl: string | null = null;
|
private geoProxyUrl: string | null = null;
|
||||||
private geoSessionStartedAt: Date | null = null;
|
private geoSessionStartedAt: Date | null = null;
|
||||||
|
private storedTimezone: string | null = null;
|
||||||
|
private storedFingerprint: WorkerFingerprint | null = null;
|
||||||
|
|
||||||
constructor(role: TaskRole | null = null, workerId?: string) {
|
constructor(role: TaskRole | null = null, workerId?: string) {
|
||||||
this.pool = getPool();
|
this.pool = getPool();
|
||||||
@@ -655,7 +681,22 @@ export class TaskWorker {
|
|||||||
|
|
||||||
console.log(`[TaskWorker] Preflight status reported to worker_registry`);
|
console.log(`[TaskWorker] Preflight status reported to worker_registry`);
|
||||||
if (this.preflightHttpResult?.proxyIp) {
|
if (this.preflightHttpResult?.proxyIp) {
|
||||||
console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${(this.preflightHttpResult as any).detectedTimezone || 'unknown'}`);
|
const detectedTimezone = (this.preflightHttpResult as any).detectedTimezone;
|
||||||
|
const detectedLocation = (this.preflightHttpResult as any).detectedLocation;
|
||||||
|
console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${detectedTimezone || 'unknown'}`);
|
||||||
|
|
||||||
|
// Store fingerprint for task execution - CRITICAL for anti-detect consistency
|
||||||
|
if (this.preflightHttpPassed) {
|
||||||
|
this.storedTimezone = detectedTimezone || null;
|
||||||
|
this.storedFingerprint = {
|
||||||
|
timezone: detectedTimezone,
|
||||||
|
city: detectedLocation?.city,
|
||||||
|
state: detectedLocation?.region,
|
||||||
|
ip: this.preflightHttpResult.proxyIp,
|
||||||
|
locale: 'en-US', // US proxies use English
|
||||||
|
};
|
||||||
|
console.log(`[TaskWorker] Stored fingerprint: ${JSON.stringify(this.storedFingerprint)}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
// Non-fatal - worker can still function
|
// Non-fatal - worker can still function
|
||||||
@@ -1349,7 +1390,7 @@ export class TaskWorker {
|
|||||||
throw new Error(`No handler registered for role: ${task.role}`);
|
throw new Error(`No handler registered for role: ${task.role}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create context with step tracking
|
// Create context with step tracking and fingerprint
|
||||||
const ctx: TaskContext = {
|
const ctx: TaskContext = {
|
||||||
pool: this.pool,
|
pool: this.pool,
|
||||||
workerId: this.workerId,
|
workerId: this.workerId,
|
||||||
@@ -1361,6 +1402,8 @@ export class TaskWorker {
|
|||||||
updateStep: (step: string, detail?: string) => {
|
updateStep: (step: string, detail?: string) => {
|
||||||
this.updateTaskStep(task.id, step, detail);
|
this.updateTaskStep(task.id, step, detail);
|
||||||
},
|
},
|
||||||
|
// Pass stored fingerprint for browser configuration
|
||||||
|
fingerprint: this.storedFingerprint || undefined,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Initialize step tracking for this task
|
// Initialize step tracking for this task
|
||||||
|
|||||||
4
cannaiq/dist/index.html
vendored
4
cannaiq/dist/index.html
vendored
@@ -7,8 +7,8 @@
|
|||||||
<title>CannaIQ - Cannabis Menu Intelligence Platform</title>
|
<title>CannaIQ - Cannabis Menu Intelligence Platform</title>
|
||||||
<meta name="description" content="CannaIQ provides real-time cannabis dispensary menu data, product tracking, and analytics for dispensaries across Arizona." />
|
<meta name="description" content="CannaIQ provides real-time cannabis dispensary menu data, product tracking, and analytics for dispensaries across Arizona." />
|
||||||
<meta name="keywords" content="cannabis, dispensary, menu, products, analytics, Arizona" />
|
<meta name="keywords" content="cannabis, dispensary, menu, products, analytics, Arizona" />
|
||||||
<script type="module" crossorigin src="/assets/index-BkhbQgZG.js"></script>
|
<script type="module" crossorigin src="/assets/index-Cgew9i_-.js"></script>
|
||||||
<link rel="stylesheet" crossorigin href="/assets/index-DcW_XTOx.css">
|
<link rel="stylesheet" crossorigin href="/assets/index-yJj_6wf9.css">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="root"></div>
|
<div id="root"></div>
|
||||||
|
|||||||
@@ -1421,7 +1421,10 @@ export default function TasksDashboard() {
|
|||||||
{schedule.state_code}
|
{schedule.state_code}
|
||||||
</span>
|
</span>
|
||||||
) : (
|
) : (
|
||||||
<span className="text-gray-400">-</span>
|
<span className="inline-flex items-center gap-1 px-2 py-0.5 bg-gray-100 text-gray-600 rounded font-medium">
|
||||||
|
<Globe className="w-3 h-3" />
|
||||||
|
All
|
||||||
|
</span>
|
||||||
)}
|
)}
|
||||||
</td>
|
</td>
|
||||||
<td className="px-4 py-3 text-sm">
|
<td className="px-4 py-3 text-sm">
|
||||||
|
|||||||
Reference in New Issue
Block a user