The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
215 lines
6.8 KiB
TypeScript
215 lines
6.8 KiB
TypeScript
import { chromium as playwright } from 'playwright-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import fs from 'fs/promises';
|
|
import path from 'path';
|
|
|
|
// Register the stealth plugin on the playwright-extra Chromium launcher at module load,
// i.e. before main() launches a browser. NOTE(review): presumably this is here to make the
// automated browser harder to fingerprint as a bot — confirm against the plugin's docs.
playwright.use(StealthPlugin());
|
|
|
|
/**
 * One purchasable option of a product as scraped from the page.
 */
type ProductVariant = {
  /** Display text of the variant element (never empty for entries that are kept). */
  label: string;
  /** Price parsed from the variant element's text, when a number could be found. */
  price?: number;
  /** Inventory text; not populated by the extraction code visible in this file. */
  inventory?: string;
};
|
|
|
|
/**
 * Everything the scraper extracts for a single product page.
 */
type ProductData = {
  // --- always present ---
  /** Product title; an empty string when no title element matched. */
  name: string;
  /** URL of the page the data was read from. */
  productUrl: string;
  /** De-duplicated image URLs found on the page. */
  images: string[];

  // --- present only when the page exposed them ---
  /** Brand text. */
  brand?: string;
  /** Headline price parsed from the price element's text. */
  price?: number;
  /** Description text. */
  description?: string;
  /** Raw text of the first potency element mentioning "thc". */
  thc?: string;
  /** Raw text of the first potency element mentioning "cbd". */
  cbd?: string;
  /** Breadcrumb / category text. */
  category?: string;
  /** Variant options found on the page. */
  variants?: ProductVariant[];
};
|
|
|
|
/** Product page this script scrapes. */
const PRODUCT_URL =
  'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/product/mfused-loud-liquid-diamonds-aio-stoopid-gas';

/** Root folder for everything the scraper writes, relative to the current working directory. */
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'dutchie-product');

/** Folder that receives downloaded product images. */
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');

/** File that receives the extracted product as JSON. */
const JSON_PATH = path.join(OUTPUT_DIR, 'product.json');
|
|
|
|
/**
 * Creates the image output folder, including any missing parent folders.
 * Because IMAGE_DIR lives inside OUTPUT_DIR, this also creates OUTPUT_DIR.
 */
async function ensureDirs() {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(IMAGE_DIR, mkdirOptions);
}
|
|
|
|
/**
 * Polls the page until it no longer looks like a Cloudflare challenge.
 *
 * A page is treated as a challenge when its title contains "Attention Required" or
 * "Just a moment", or when its HTML contains "challenge-platform" or "cf-challenge".
 *
 * @param page - Page-like object exposing `title()`, `content()` and `waitForTimeout(ms)`.
 * @param maxWaitMs - Upper bound, in milliseconds, on how long to keep polling.
 * @returns `true` as soon as both the title and the HTML were read successfully and neither
 *   contains a challenge marker; `false` if the time budget runs out first.
 */
async function waitForCloudflare(page: any, maxWaitMs = 60000): Promise<boolean> {
  const pollIntervalMs = 2000;
  const titleMarkers = ['Attention Required', 'Just a moment'];
  const contentMarkers = ['challenge-platform', 'cf-challenge'];
  const deadline = Date.now() + maxWaitMs;

  while (Date.now() < deadline) {
    // A read that throws (for example while the page is navigating) yields null, meaning
    // "unknown". It must not be mistaken for "no challenge present".
    const title: string | null = await page.title().catch(() => null);
    const content: string | null = await page.content().catch(() => null);

    const bothRead = title !== null && content !== null;
    const challenge =
      titleMarkers.some(marker => title?.includes(marker)) ||
      contentMarkers.some(marker => content?.includes(marker));

    if (bothRead && !challenge) return true;

    // Never sleep past the deadline, so the total wait stays within maxWaitMs.
    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) break;
    await page.waitForTimeout(Math.min(pollIntervalMs, remainingMs));
  }

  return false;
}
|
|
|
|
/**
 * Scrapes product fields out of the page that is currently loaded.
 *
 * The callback handed to `page.evaluate` reads the DOM through `document` and `window`,
 * so every helper it needs is declared inside the callback itself.
 *
 * @param page - Page-like object exposing `evaluate(fn)`.
 * @returns The extracted product. `name` is `''` when no title element matched; optional
 *   fields are `undefined` when nothing matched; `images` is de-duplicated.
 */
async function extractProduct(page: any): Promise<ProductData> {
  return page.evaluate(() => {
    // First non-empty, trimmed innerText among the selectors, tried in the given order.
    const pickText = (selectorList: string[]): string | undefined => {
      for (const sel of selectorList) {
        const el = document.querySelector(sel) as HTMLElement | null;
        const txt = el?.innerText?.trim();
        if (txt) return txt;
      }
      return undefined;
    };

    // Trimmed innerText of every element matching the selector, empty texts dropped.
    const pickAllTexts = (selector: string): string[] =>
      Array.from(document.querySelectorAll(selector))
        .map(el => (el as HTMLElement).innerText?.trim())
        .filter(Boolean) as string[];

    // Parses a price out of free text. A "$"-prefixed amount wins over any earlier bare
    // number, so text such as "1g $45.00" yields 45 rather than 1. Thousands separators
    // ("$1,045.50") are accepted. Falls back to the first bare number when no "$" is present.
    const parsePrice = (text?: string | null): number | undefined => {
      if (!text) return undefined;
      const match =
        text.match(/\$\s*(\d[\d,]*(?:\.\d{1,2})?)/) ?? text.match(/(\d[\d,]*(?:\.\d{1,2})?)/);
      if (!match) return undefined;
      const value = parseFloat(match[1].replace(/,/g, ''));
      return Number.isNaN(value) ? undefined : value;
    };

    const name =
      pickText(['[data-testid="product-name"]', 'h1', '[class*="ProductTitle"]']) || '';
    const brand = pickText(['[data-testid="product-brand"]', '[class*="Brand"]']);
    const priceText =
      pickText([
        '[data-testid="product-price"]',
        '[data-testid*="price"]',
        '[class*="Price"]'
      ]) || '';
    const description = pickText(['[data-testid="product-description"]', 'article', '[class*="Description"]']);

    // Potency values are kept as raw text; the first entry mentioning each cannabinoid wins.
    const potencyTexts = pickAllTexts('[data-testid*="thc"], [data-testid*="cbd"], [class*="Potency"]');
    const thc = potencyTexts.find(t => t.toLowerCase().includes('thc')) || undefined;
    const cbd = potencyTexts.find(t => t.toLowerCase().includes('cbd')) || undefined;

    const category =
      pickText(['[data-testid="breadcrumb"]', '[class*="Breadcrumb"]', '[data-testid*="category"]']) || undefined;

    const variantEls = Array.from(
      document.querySelectorAll('[data-testid*="variant"], [data-testid*="option"], [class*="Variant"]')
    );
    const variants = variantEls
      .map(el => {
        // Prefer the first nested span/div as the label, else the element's whole text.
        const label =
          (el.querySelector('span,div') as HTMLElement | null)?.innerText?.trim() ||
          el.textContent?.trim() ||
          '';
        const price = parsePrice(el.textContent || undefined);
        return { label, price };
      })
      .filter(v => v.label);

    const imageUrls = Array.from(
      document.querySelectorAll('img[src*="images.dutchie.com"], source[srcset*="images.dutchie.com"], img[src*="https://images.dutchie.com"]')
    )
      .map(el => {
        if (el instanceof HTMLImageElement) return el.src;
        // For <source>, take the URL of the first srcset candidate (dropping its descriptor).
        const srcset = (el as HTMLSourceElement).srcset || '';
        return srcset.split(',')[0]?.trim().split(' ')[0];
      })
      .filter((u): u is string => !!u);

    return {
      name,
      brand,
      price: parsePrice(priceText),
      description,
      thc,
      cbd,
      category,
      variants,
      images: Array.from(new Set(imageUrls)),
      productUrl: window.location.href,
    };
  });
}
|
|
|
|
/**
 * Builds a filesystem-friendly file name from an arbitrary base string.
 *
 * The base is lower-cased, every run of characters outside `a-z0-9` becomes a single `-`,
 * and one leading/trailing `-` is removed. An empty result falls back to `image`.
 *
 * @param base - Text to turn into the file stem.
 * @param ext - Extension to append (without the dot); used as-is.
 * @returns `<stem>.<ext>`
 */
function safeFileName(base: string, ext: string): string {
  const slug = base
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
  const stem = slug || 'image';
  return `${stem}.${ext}`;
}
|
|
|
|
/**
 * Downloads each image URL into IMAGE_DIR, one after another.
 *
 * File names are derived from the last segment of the URL's path (the query string is
 * ignored). The extension comes from the URL path when it has one, otherwise from the
 * response's content type (`png`, `jpg`, `webp`, else `bin`). When two URLs would produce
 * the same file name within one call, later ones get a `-2`, `-3`, ... suffix so that no
 * download overwrites another.
 *
 * A failed download is logged with `console.warn` and skipped; it never aborts the batch.
 *
 * @param urls - Image URLs to fetch.
 * @returns Paths of the files that were written, in download order.
 */
async function downloadImages(urls: string[]): Promise<string[]> {
  const saved: string[] = [];
  const usedNames = new Set<string>();

  for (const url of urls) {
    try {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const buf = Buffer.from(await res.arrayBuffer());
      const contentType = res.headers.get('content-type') || '';

      // Work from the parsed pathname so query strings and fragments never leak into names.
      const { pathname } = new URL(url);
      const rawExt = path.extname(pathname);
      const urlExt = rawExt.replace('.', '');
      const ext =
        urlExt ||
        (contentType.includes('png')
          ? 'png'
          : contentType.includes('jpeg')
            ? 'jpg'
            : contentType.includes('webp')
              ? 'webp'
              : 'bin');

      const stem = path.basename(pathname, rawExt) || 'image';
      let fileName = safeFileName(stem, ext);

      // Disambiguate names already used in this run instead of overwriting the earlier file.
      const sanitizedStem = fileName.slice(0, fileName.length - ext.length - 1);
      for (let n = 2; usedNames.has(fileName); n++) {
        fileName = safeFileName(`${sanitizedStem}-${n}`, ext);
      }
      usedNames.add(fileName);

      const filePath = path.join(IMAGE_DIR, fileName);
      await fs.writeFile(filePath, buf);
      saved.push(filePath);
    } catch (err) {
      console.warn(`Failed to download image ${url}:`, err);
    }
  }

  return saved;
}
|
|
|
|
/**
 * Entry point: scrapes PRODUCT_URL with a headless Chromium browser, downloads the product's
 * images and writes the result to JSON_PATH.
 *
 * Errors raised while scraping are logged and turn the process exit code to 1. The browser
 * is closed in every case once it has been launched.
 */
async function main() {
  await ensureDirs();

  const browser = await playwright.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    // Context and page are created inside the try so a failure here still closes the browser.
    const context = await browser.newContext({
      viewport: { width: 1280, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    console.log(`Navigating to product page...`);
    await page.goto(PRODUCT_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });

    const cfOk = await waitForCloudflare(page, 60000);
    if (!cfOk) {
      throw new Error('Cloudflare challenge not passed in time');
    }

    // Best effort: extraction is still attempted when the selector never shows up,
    // but the timeout is reported instead of being swallowed silently.
    await page
      .waitForSelector('[data-testid*="product"]', { timeout: 60000 })
      .catch(() => {
        console.warn('Product selector did not appear in time; extracting from the current DOM');
      });
    // Short fixed pause before reading the DOM.
    await page.waitForTimeout(2000);

    const product = await extractProduct(page);
    console.log('Extracted product:');
    console.log(product);

    const imagePaths = await downloadImages(product.images);
    const finalProduct = { ...product, imagePaths };

    await fs.writeFile(JSON_PATH, JSON.stringify(finalProduct, null, 2));

    console.log(`Saved product JSON to ${JSON_PATH}`);
    if (imagePaths.length) {
      console.log(`Saved ${imagePaths.length} images to ${IMAGE_DIR}`);
    }
  } catch (err) {
    console.error('Failed to scrape product:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}
|
|
|
|
// main() guards only the scraping steps with its own try/catch. Rejections from setup
// (creating directories, launching the browser) or from closing the browser would
// otherwise surface as an unhandled promise rejection, so they are reported here.
main().catch((err: unknown) => {
  console.error('Scraper terminated with an unhandled error:', err);
  process.exitCode = 1;
});
|