The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
184 lines
6.2 KiB
TypeScript
184 lines
6.2 KiB
TypeScript
import { chromium as playwright } from 'playwright-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import fs from 'fs/promises';
|
|
import path from 'path';
|
|
|
|
// Register the stealth plugin on the playwright-extra chromium launcher at
// module load, so it is in place before main() launches a browser.
// NOTE(review): presumably this is here to reduce bot detection on the target
// site — confirm it is still required.
playwright.use(StealthPlugin());
|
|
|
|
/**
 * One menu item as read from a single product card on the page.
 *
 * `name` and `inStock` are always populated; the remaining fields are only
 * set when the matching element could be found on the card.
 */
type Product = {
  /** Display name of the product; records without a name are discarded. */
  name: string;
  /** False when the card text mentions "sold out" or "out of stock". */
  inStock: boolean;
  /** Brand label shown on the card, when present. */
  brand?: string;
  /** Numeric price parsed from the card text, when one could be found. */
  price?: number;
  /** Size / weight label shown on the card, when present. */
  size?: string;
  /** Category label shown on the card, when present. */
  category?: string;
  /** Link to the product detail page, when the card contains one. */
  url?: string;
  /** Absolute URL of the product image, when one could be resolved. */
  imageUrl?: string;
};
|
|
|
|
/** The products that share one brand label; this is the shape written to the JSON output. */
type BrandGroup = {
  /** Brand label for the group ("Unknown" for products without a brand). */
  brand: string;
  /** Every scraped product carrying that brand. */
  products: Product[];
};
|
|
|
|
/** Embedded Dutchie menu page that is scraped. */
const TARGET_URL = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';

/** Directory (under the current working directory) that receives scrape output. */
const OUTPUT_DIR = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted');

/** File the brand-grouped inventory is written to. */
const JSON_PATH = path.resolve(OUTPUT_DIR, 'inventory-by-brand.json');
|
|
|
|
/**
 * Makes sure the scrape output directory exists.
 *
 * Missing parent directories are created as well, and an already existing
 * directory is not an error (recursive mkdir).
 */
async function ensureDirs(): Promise<void> {
  const mkdirOptions = { recursive: true } as const;
  await fs.mkdir(OUTPUT_DIR, mkdirOptions);
}
|
|
|
|
/**
 * Polls the page until it no longer looks like a Cloudflare challenge.
 *
 * A page counts as "challenged" when its title contains "Attention Required"
 * or "Just a moment", or its HTML contains "challenge-platform" or
 * "cf-challenge".
 *
 * A poll only counts as "passed" when both the title and the HTML could
 * actually be read. Reads can fail while the page is navigating; such a poll
 * is inconclusive and is retried instead of being treated as a pass.
 *
 * @param page - Page-like object exposing `title()`, `content()` and
 *   `waitForTimeout()`.
 * @param maxWaitMs - Upper bound, in milliseconds, on the total time spent
 *   polling. Defaults to 60 seconds.
 * @returns `true` once a successfully read page shows no challenge markers,
 *   `false` if that never happened within `maxWaitMs`.
 */
async function waitForCloudflare(
  page: {
    title(): Promise<string>;
    content(): Promise<string>;
    waitForTimeout(ms: number): Promise<void>;
  },
  maxWaitMs = 60000,
): Promise<boolean> {
  const pollIntervalMs = 2000;
  const deadline = Date.now() + maxWaitMs;

  while (Date.now() < deadline) {
    let title: string | undefined;
    let html: string | undefined;
    try {
      title = await page.title();
      html = await page.content();
    } catch {
      // The page could not be read right now; leave the values undefined so
      // this poll is inconclusive rather than a false "passed".
    }

    if (title !== undefined && html !== undefined) {
      const challenged =
        title.includes('Attention Required') ||
        title.includes('Just a moment') ||
        html.includes('challenge-platform') ||
        html.includes('cf-challenge');
      if (!challenged) return true;
    }

    // Never sleep past the deadline.
    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) break;
    await page.waitForTimeout(Math.min(pollIntervalMs, remainingMs));
  }

  return false;
}
|
|
|
|
/**
 * Scrolls the page down step by step so more product cards get a chance to
 * render, then scrolls back to the top.
 *
 * After each wheel step the number of elements matching the card selector is
 * compared with the previous count; scrolling stops as soon as a step adds no
 * new elements, or after `maxScrolls` steps.
 *
 * The count taken after one step is reused as the baseline for the next, so
 * the page is queried once per step rather than twice.
 *
 * @param page - Playwright page (kept as `any` to match the rest of the file).
 * @param maxScrolls - Maximum number of wheel steps. Defaults to 40.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- page type is not imported in this script
async function loadAllProducts(page: any, maxScrolls = 40): Promise<void> {
  const cardSelector = '[data-testid*="product"], [data-testid*="card"]';
  const countCards = (): Promise<number> =>
    page.$$eval(cardSelector, (els: unknown[]) => els.length);

  let knownCount = await countCards();
  for (let step = 0; step < maxScrolls; step++) {
    await page.mouse.wheel(0, 1400);
    // Pause so content triggered by the scroll has time to render.
    await page.waitForTimeout(900);

    const currentCount = await countCards();
    if (currentCount <= knownCount) break;
    knownCount = currentCount;
  }

  await page.evaluate(() => window.scrollTo({ top: 0 }));
}
|
|
|
|
/**
 * Reads every product card currently in the DOM and converts it to a
 * `Product` record.
 *
 * The extraction logic is passed to `page.evaluate` as source text and runs
 * inside the page.
 * NOTE(review): presumably it is a string (not a function) so that no
 * transpiler helpers leak into the page context — confirm before converting.
 *
 * Per card:
 * - name: dedicated name element, else first `h3`/`h4`, else the first text
 *   line longer than three characters; cards without a name are dropped.
 * - price: taken from the dedicated price element when present. Otherwise the
 *   whole card text is searched, but only for a `$`-prefixed amount, so sizes
 *   or potencies such as "3.5g" are not mistaken for a price.
 * - inStock: false when the card text contains "sold out" or "out of stock".
 *
 * Elements whose test id is one of the per-card field ids
 * (`product-card-name`, `product-card-brand`, `product-card-price`) are
 * skipped: the substring card selector matches them too, but they are fields
 * inside a card, not cards.
 *
 * @param page - Playwright page (kept as `any` to match the rest of the file).
 * @returns The products found, in DOM order.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- page type is not imported in this script
async function extractProducts(page: any): Promise<Product[]> {
  // String.raw keeps the regex and '\n' escapes exactly as written below.
  const script = String.raw`
(() => {
  const FIELD_TEST_IDS = ['product-card-name', 'product-card-brand', 'product-card-price'];

  function parsePrice(text, requireDollarSign) {
    if (!text) return undefined;
    const pattern = requireDollarSign
      ? /\$\s*((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{1,2})?)/
      : /\$?\s*((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{1,2})?)/;
    const match = text.match(pattern);
    return match ? parseFloat(match[1].replace(/,/g, '')) : undefined;
  }

  function pickImage(card) {
    const imgEl =
      card.querySelector('img[src^="http"]') ||
      card.querySelector('source[srcset]');
    if (imgEl && imgEl.src && imgEl.src.startsWith('http')) {
      return imgEl.src;
    }
    if (imgEl && imgEl.srcset) {
      const first = imgEl.srcset.split(',')[0]?.trim().split(' ')[0];
      if (first && first.startsWith('http')) return first;
    }
    const dataSrc = card.querySelector('img[data-src]')?.getAttribute('data-src');
    if (dataSrc && dataSrc.startsWith('http')) return dataSrc;
    return undefined;
  }

  const cards = Array.from(
    document.querySelectorAll('[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]')
  ).filter((el) => !FIELD_TEST_IDS.includes(el.getAttribute('data-testid') || ''));

  return cards
    .map((card) => {
      const name =
        card.querySelector('[data-testid="product-card-name"]')?.innerText?.trim() ||
        card.querySelector('[data-testid="product-name"]')?.innerText?.trim() ||
        card.querySelector('h3, h4')?.innerText?.trim() ||
        (card.textContent || '').split('\n').map((t) => t.trim()).find((t) => t.length > 3) ||
        '';

      const brand =
        card.querySelector('[data-testid="product-card-brand"]')?.innerText?.trim() ||
        card.querySelector('[data-testid="product-brand"]')?.innerText?.trim() ||
        undefined;

      const priceLabel = card.querySelector('[data-testid="product-card-price"]')?.innerText;
      const price = priceLabel
        ? parsePrice(priceLabel, false)
        : parsePrice(card.textContent || '', true);

      const size =
        card.querySelector('[data-testid*="size"]')?.innerText?.trim() ||
        card.querySelector('[data-testid*="weight"]')?.innerText?.trim() ||
        undefined;

      const category =
        card.querySelector('[data-testid*="category"]')?.innerText?.trim() ||
        undefined;

      const link = card.querySelector('a[href*="/product/"]');
      const url = link?.href;

      const imageUrl = pickImage(card);

      const cardText = (card.textContent || '').toLowerCase();
      const inStock = !(cardText.includes('sold out') || cardText.includes('out of stock'));

      return { name, brand, price, size, category, url, imageUrl, inStock };
    })
    .filter((p) => p.name);
})();
`;

  return page.evaluate(script);
}
|
|
|
|
/**
 * Buckets products by brand label.
 *
 * Labels are compared after trimming surrounding whitespace. Products with a
 * missing, empty or whitespace-only brand go into the "Unknown" group.
 * Groups appear in the order their brand is first seen, and products keep
 * their input order within a group. The product objects themselves are not
 * modified.
 *
 * @param products - Products to group.
 * @returns One group per distinct brand label.
 */
function groupByBrand(products: Product[]): BrandGroup[] {
  const byBrand = new Map<string, Product[]>();

  for (const product of products) {
    // `||` (not `??`) on purpose: an empty label must also fall back to "Unknown".
    const label = product.brand?.trim() || 'Unknown';
    const bucket = byBrand.get(label);
    if (bucket) {
      bucket.push(product);
    } else {
      byBrand.set(label, [product]);
    }
  }

  return Array.from(byBrand, ([brand, items]) => ({ brand, products: items }));
}
|
|
|
|
/**
 * Runs one full scrape: launches a headless browser, opens the target menu,
 * waits out the Cloudflare check, scrolls to load the cards, extracts the
 * products, and writes them grouped by brand to `JSON_PATH`.
 *
 * Every step, including directory creation and browser launch, runs inside
 * the `try`, so any failure is logged and reported through
 * `process.exitCode = 1`. The browser, once launched, is always closed.
 */
async function main(): Promise<void> {
  let browser: Awaited<ReturnType<typeof playwright.launch>> | undefined;

  try {
    await ensureDirs();

    browser = await playwright.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
    });

    const page = await browser.newPage({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    console.log(`Navigating to ${TARGET_URL}...`);
    await page.goto(TARGET_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });

    const cfOk = await waitForCloudflare(page, 60000);
    if (!cfOk) throw new Error('Cloudflare challenge not passed in time');

    // Best effort: a missing selector is not fatal, but it should be visible
    // in the log because it usually means an empty result follows.
    try {
      await page.waitForSelector('[data-testid*="product"], [data-testid*="card"]', { timeout: 60000 });
    } catch {
      console.warn('No product cards appeared within 60s; continuing with whatever is on the page');
    }

    await loadAllProducts(page);
    const products = await extractProducts(page);
    const grouped = groupByBrand(products);

    if (products.length === 0) {
      console.warn('No products were extracted; the output file will contain an empty list');
    }

    await fs.writeFile(JSON_PATH, JSON.stringify(grouped, null, 2));
    console.log(`Found ${products.length} products across ${grouped.length} brands`);
    console.log(`Saved grouped inventory to ${JSON_PATH}`);
  } catch (err) {
    console.error('Inventory scrape failed:', err);
    process.exitCode = 1;
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        // A failed shutdown should not mask the scrape result.
        console.warn('Failed to close browser cleanly:', closeErr);
      }
    }
  }
}
|
|
|
|
// Script entry point. The promise is handled explicitly so that a rejection
// escaping main() is logged and reflected in the exit code instead of
// surfacing as an unhandled rejection.
main().catch((err: unknown) => {
  console.error('Inventory scrape crashed:', err);
  process.exitCode = 1;
});
|