- Move deprecated directories to src/_deprecated/: - hydration/ (old pipeline approach) - scraper-v2/ (old Puppeteer scraper) - canonical-hydration/ (merged into tasks) - Unused services: availability, crawler-logger, geolocation, etc - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser - Archive outdated docs to docs/_archive/: - ANALYTICS_RUNBOOK.md - ANALYTICS_V2_EXAMPLES.md - BRAND_INTELLIGENCE_API.md - CRAWL_PIPELINE.md - TASK_WORKFLOW_2024-12-10.md - WORKER_TASK_ARCHITECTURE.md - ORGANIC_SCRAPING_GUIDE.md - Add docs/CODEBASE_MAP.md as single source of truth - Add warning files to deprecated/archived directories - Slim down CLAUDE.md to essential rules only 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
181 lines
5.2 KiB
JavaScript
181 lines
5.2 KiB
JavaScript
/**
 * Stealth Browser Payload Capture - Direct GraphQL Injection
 *
 * Uses the browser session to make GraphQL requests that look organic.
 * Adds proper headers matching what Dutchie's frontend sends.
 */
|
|
|
|
const puppeteer = require('puppeteer-extra');
|
|
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
|
const fs = require('fs');
|
|
|
|
puppeteer.use(StealthPlugin());
|
|
|
|
/**
 * Capture a dispensary's product list from Dutchie through a stealth browser
 * session and write it to disk as JSON.
 *
 * Flow: launch headless Chromium, load the embedded menu page for `cName`,
 * page through the `FilteredProducts` persisted GraphQL query from inside the
 * page using `fetch` with credentials included, then write everything that
 * was collected to `outputPath`.
 *
 * Problems while paging (non-2xx response, GraphQL `errors`, thrown
 * exception) do not reject: they end the paging loop, are printed with a
 * `[Browser]` prefix, and whatever was collected so far is still written.
 *
 * @param {object} config
 * @param {*} [config.dispensaryId=null] - Copied into the payload unchanged.
 * @param {string} config.platformId - Sent as `productsFilter.dispensaryId`.
 * @param {string} config.cName - Slug used in the embedded-menu URL and in the
 *   default output file name.
 * @param {string} [config.outputPath] - Destination file; defaults to
 *   `/tmp/payload_<cName>_<timestamp>.json`.
 * @param {number} [config.perPage=100] - Products requested per page.
 * @param {number} [config.maxPages=30] - Upper bound on pages requested.
 * @returns {Promise<{dispensaryId: *, platformId: string, cName: string,
 *   fetchedAt: string, productCount: number, products: object[]}>} The payload
 *   that was written to `outputPath`.
 * @throws {TypeError} If `cName` or `platformId` is missing or empty.
 */
async function capturePayload(config) {
  const {
    dispensaryId = null,
    platformId,
    cName,
    outputPath = `/tmp/payload_${cName}_${Date.now()}.json`,
    perPage = 100,
    maxPages = 30,
  } = config;

  // Fail before launching a browser rather than scraping ".../undefined".
  if (cName == null || cName === '') {
    throw new TypeError('capturePayload: config.cName is required');
  }
  if (platformId == null || platformId === '') {
    throw new TypeError('capturePayload: config.platformId is required');
  }

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  let result;
  try {
    const page = await browser.newPage();

    // Establish session by visiting the embedded menu
    const embedUrl = `https://dutchie.com/embedded-menu/${cName}?menuType=rec`;
    console.log(`[Capture] Establishing session at ${embedUrl}...`);

    await page.goto(embedUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    console.log('[Capture] Session established, fetching ALL products...');

    // Fetch all products using GET requests with proper headers.
    // This callback runs inside the page, so it can only use its own
    // arguments and browser globals - nothing from the enclosing Node scope.
    result = await page.evaluate(async (filterDispensaryId, pageSize, pageLimit) => {
      const allProducts = [];
      const logs = [];
      let totalCount = 0;
      const sessionId = `browser-session-${Date.now()}`;

      try {
        for (let pageNum = 0; pageNum < pageLimit; pageNum++) {
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId: filterDispensaryId,
              pricingType: 'rec',
              Status: 'Active', // 'Active' for in-stock products per CLAUDE.md
              types: [],
              useCache: true,
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageNum,
            perPage: pageSize,
          };

          // NOTE(review): hash of the persisted FilteredProducts query;
          // presumably has to track what the site currently serves - confirm.
          const extensions = {
            persistedQuery: {
              version: 1,
              sha256Hash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
            },
          };

          // Build GET URL like the browser does
          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify(extensions),
          });
          const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;

          const response = await fetch(url, {
            method: 'GET',
            headers: {
              'Accept': 'application/json',
              'content-type': 'application/json',
              'x-dutchie-session': sessionId,
              'apollographql-client-name': 'Marketplace (production)',
            },
            credentials: 'include',
          });

          logs.push(`Page ${pageNum}: HTTP ${response.status}`);

          if (!response.ok) {
            const text = await response.text();
            logs.push(`HTTP error: ${response.status} - ${text.slice(0, 200)}`);
            break;
          }

          const json = await response.json();

          if (json.errors) {
            logs.push(`GraphQL error: ${JSON.stringify(json.errors).slice(0, 200)}`);
            break;
          }

          const data = json?.data?.filteredProducts;
          if (!data || !data.products) {
            logs.push('No products in response');
            break;
          }

          const products = data.products;
          allProducts.push(...products);

          if (pageNum === 0) {
            totalCount = data.queryInfo?.totalCount || 0;
            logs.push(`Total reported: ${totalCount}`);
          }

          logs.push(`Got ${products.length} products (total: ${allProducts.length}/${totalCount})`);

          // Only trust totalCount when the API actually reported one. A
          // missing count is stored as 0, which would otherwise end paging
          // after the first page; a short page still ends it in that case.
          const reachedReportedTotal = totalCount > 0 && allProducts.length >= totalCount;
          if (reachedReportedTotal || products.length < pageSize) {
            break;
          }

          // Small delay between pages to be polite
          await new Promise((resolve) => setTimeout(resolve, 200));
        }
      } catch (err) {
        // Best effort: keep what was collected and report the failure to Node.
        logs.push(`Error: ${err.message}`);
      }

      return { products: allProducts, totalCount, logs };
    }, platformId, perPage, maxPages);
  } finally {
    // Always release the Chromium process, even when navigation or paging throws.
    await browser.close();
  }

  // Print logs from browser context
  for (const log of result.logs) {
    console.log(`[Browser] ${log}`);
  }

  console.log(`[Capture] Got ${result.products.length} products (API reported ${result.totalCount})`);

  if (result.products.length < result.totalCount) {
    // Paging ended early (error, page cap, or short page); the file is partial.
    console.warn(`[Capture] WARNING: captured fewer products than the API reported`);
  }

  const payload = {
    dispensaryId,
    platformId,
    cName,
    fetchedAt: new Date().toISOString(),
    productCount: result.products.length,
    products: result.products,
  };

  fs.writeFileSync(outputPath, JSON.stringify(payload, null, 2));

  console.log(`\n=== Capture Complete ===`);
  console.log(`Total products: ${result.products.length}`);
  console.log(`Saved to: ${outputPath}`);
  console.log(`File size: ${(fs.statSync(outputPath).size / 1024).toFixed(1)} KB`);

  return payload;
}
|
|
|
|
// Run: capture one hard-coded dispensary and print a sample product.
(async () => {
  const payload = await capturePayload({
    cName: 'AZ-Deeply-Rooted',
    platformId: '6405ef617056e8014d79101b',
  });

  if (payload.products.length > 0) {
    const sample = payload.products[0];
    // Field casing is not known here, so try both spellings for name and brand.
    console.log(`\nSample: ${sample.Name || sample.name} - ${sample.brand?.name || sample.brandName}`);
  }
})().catch((err) => {
  console.error(err);
  // Report the failure to the shell instead of exiting 0 after an error.
  process.exitCode = 1;
});
|