fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-03 18:45:05 -07:00
parent 54f40d26bb
commit 66e07b2009
466 changed files with 84988 additions and 9226 deletions

View File

@@ -0,0 +1,214 @@
import { chromium as playwright } from 'playwright-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import fs from 'fs/promises';
import path from 'path';
playwright.use(StealthPlugin());
/**
 * One selectable option listed on a product page.
 * `extractProduct` fills `label` and `price`; nothing in this script sets `inventory`.
 */
type ProductVariant = {
label: string;
price?: number;
inventory?: string;
};
/**
 * Record produced by `extractProduct` for a single product page.
 * Optional fields stay undefined when no matching element text is found.
 */
type ProductData = {
name: string;
brand?: string;
price?: number;
description?: string;
// Raw text of the first potency element mentioning "thc" / "cbd" (not parsed to a number).
thc?: string;
cbd?: string;
category?: string;
variants?: ProductVariant[];
// De-duplicated image URLs found on the page.
images: string[];
// `window.location.href` at extraction time.
productUrl: string;
};
// The single product page this script navigates to.
const PRODUCT_URL =
'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/product/mfused-loud-liquid-diamonds-aio-stoopid-gas';
// All output is written under <cwd>/scrape-output/dutchie-product.
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'dutchie-product');
// Directory that receives downloaded product images.
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');
// JSON file holding the extracted product plus the saved image paths.
const JSON_PATH = path.join(OUTPUT_DIR, 'product.json');
/** Creates the image output directory, including any missing parent directories. */
async function ensureDirs() {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(IMAGE_DIR, mkdirOptions);
}
/**
 * Polls the page until it no longer looks like a challenge page.
 *
 * A page counts as a challenge when its title contains "Attention Required" or
 * "Just a moment", or its HTML contains "challenge-platform" or "cf-challenge".
 * Errors while reading the title or content are treated as empty strings.
 *
 * @param page Page object exposing `title`, `content` and `waitForTimeout`.
 * @param maxWaitMs Maximum time to keep polling, in milliseconds.
 * @returns `true` once no challenge marker is seen, `false` if time runs out.
 */
async function waitForCloudflare(page: any, maxWaitMs = 60000): Promise<boolean> {
  const titleMarkers = ['Attention Required', 'Just a moment'];
  const htmlMarkers = ['challenge-platform', 'cf-challenge'];
  const startedAt = Date.now();
  for (;;) {
    if (Date.now() - startedAt >= maxWaitMs) return false;
    const pageTitle = await page.title().catch(() => '');
    const html = await page.content().catch(() => '');
    const blocked =
      titleMarkers.some((marker) => pageTitle.includes(marker)) ||
      htmlMarkers.some((marker) => html.includes(marker));
    if (!blocked) return true;
    // Give the challenge a moment before checking again.
    await page.waitForTimeout(2000);
  }
}
/**
 * Extracts product details from the currently loaded product page.
 *
 * The callback handed to `page.evaluate` reads `document` and `window`, so all
 * helpers it needs are defined inside it.
 *
 * @param page Page object exposing `evaluate`.
 * @returns The scraped product. `name` falls back to an empty string; other
 *          text fields are undefined when no matching element has text.
 */
async function extractProduct(page: any): Promise<ProductData> {
  return page.evaluate(() => {
    // Trimmed text of the first selector that yields non-empty text.
    const pickText = (selectorList: string[]): string | undefined => {
      for (const sel of selectorList) {
        const el = document.querySelector(sel) as HTMLElement | null;
        const txt = el?.innerText?.trim();
        if (txt) return txt;
      }
      return undefined;
    };

    // Trimmed, non-empty texts of every element matching the selector.
    const pickAllTexts = (selector: string): string[] =>
      Array.from(document.querySelectorAll(selector))
        .map(el => (el as HTMLElement).innerText?.trim())
        .filter(Boolean) as string[];

    // Prefer a "$"-prefixed amount so that text such as "1g $45.00" yields 45
    // rather than the leading size; fall back to the first bare number.
    const parsePrice = (text?: string | null): number | undefined => {
      if (!text) return undefined;
      const match =
        text.match(/\$\s*(\d+(?:\.\d{1,2})?)/) ?? text.match(/(\d+(?:\.\d{1,2})?)/);
      return match ? parseFloat(match[1]) : undefined;
    };

    const name =
      pickText(['[data-testid="product-name"]', 'h1', '[class*="ProductTitle"]']) || '';
    const brand = pickText(['[data-testid="product-brand"]', '[class*="Brand"]']);
    const priceText =
      pickText([
        '[data-testid="product-price"]',
        '[data-testid*="price"]',
        '[class*="Price"]'
      ]) || '';
    const description = pickText(['[data-testid="product-description"]', 'article', '[class*="Description"]']);

    const potencyTexts = pickAllTexts('[data-testid*="thc"], [data-testid*="cbd"], [class*="Potency"]');
    const thc = potencyTexts.find(t => t.toLowerCase().includes('thc')) || undefined;
    const cbd = potencyTexts.find(t => t.toLowerCase().includes('cbd')) || undefined;

    const category =
      pickText(['[data-testid="breadcrumb"]', '[class*="Breadcrumb"]', '[data-testid*="category"]']) || undefined;

    const variantEls = Array.from(
      document.querySelectorAll('[data-testid*="variant"], [data-testid*="option"], [class*="Variant"]')
    );
    const variants = variantEls.map(el => {
      const label =
        (el.querySelector('span,div') as HTMLElement | null)?.innerText?.trim() ||
        el.textContent?.trim() ||
        '';
      const price = parsePrice(el.textContent || undefined);
      return { label, price };
    }).filter(v => v.label);

    // <img> elements contribute their src; <source> elements contribute the
    // first URL of their srcset.
    const imageUrls = Array.from(
      document.querySelectorAll('img[src*="images.dutchie.com"], source[srcset*="images.dutchie.com"]')
    ).map(el => {
      if (el instanceof HTMLImageElement) return el.src;
      const srcset = (el as HTMLSourceElement).srcset || '';
      return srcset.split(',')[0]?.trim().split(' ')[0];
    }).filter((u): u is string => !!u);

    return {
      name,
      brand,
      price: parsePrice(priceText),
      description,
      thc,
      cbd,
      category,
      variants,
      images: Array.from(new Set(imageUrls)),
      productUrl: window.location.href,
    };
  });
}
/**
 * Builds a lowercase, dash-separated file name from arbitrary text.
 *
 * Runs of characters other than a-z / 0-9 become a single dash, one leading and
 * one trailing dash are stripped, and an empty result falls back to "image".
 *
 * @param base Text to derive the file stem from.
 * @param ext Extension appended after a dot (used as given).
 */
function safeFileName(base: string, ext: string): string {
  const slug = base
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
  const stem = slug || 'image';
  return `${stem}.${ext}`;
}
/**
 * Downloads each URL into IMAGE_DIR, one at a time.
 *
 * The extension comes from the URL path when present, otherwise from the
 * response content type (png / jpeg / webp, else "bin"). A failed download is
 * logged with `console.warn` and skipped.
 *
 * @param urls Image URLs to fetch.
 * @returns Paths of the files that were written, in download order.
 */
async function downloadImages(urls: string[]): Promise<string[]> {
  const extensionForType = (mime: string): string => {
    if (mime.includes('png')) return 'png';
    if (mime.includes('jpeg')) return 'jpg';
    if (mime.includes('webp')) return 'webp';
    return 'bin';
  };

  const written: string[] = [];
  for (const imageUrl of urls) {
    try {
      const response = await fetch(imageUrl);
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      const bytes = Buffer.from(await response.arrayBuffer());
      const mime = response.headers.get('content-type') || '';
      const extFromUrl = path.extname(new URL(imageUrl).pathname).replace('.', '');
      const extension = extFromUrl || extensionForType(mime);
      const stem = path.basename(imageUrl).split('.')[0] || 'image';
      const target = path.join(IMAGE_DIR, safeFileName(stem, extension));
      await fs.writeFile(target, bytes);
      written.push(target);
    } catch (err) {
      console.warn(`Failed to download image ${imageUrl}:`, err);
    }
  }
  return written;
}
/**
 * Scrapes PRODUCT_URL, downloads its images and writes the result to JSON_PATH.
 *
 * Scrape failures are logged and reported through `process.exitCode = 1`.
 * The browser is closed in every case once it has been launched.
 */
async function main() {
  await ensureDirs();
  const browser = await playwright.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    // Created inside the try so a failure here still closes the browser.
    const context = await browser.newContext({
      viewport: { width: 1280, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    console.log(`Navigating to product page...`);
    await page.goto(PRODUCT_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });
    const cfOk = await waitForCloudflare(page, 60000);
    if (!cfOk) {
      throw new Error('Cloudflare challenge not passed in time');
    }
    // Best effort: continue even if no product element shows up in time.
    await page.waitForSelector('[data-testid*="product"]', { timeout: 60000 }).catch(() => undefined);
    await page.waitForTimeout(2000);

    const product = await extractProduct(page);
    console.log('Extracted product:');
    console.log(product);

    const imagePaths = await downloadImages(product.images);
    const finalProduct = { ...product, imagePaths };
    await fs.writeFile(JSON_PATH, JSON.stringify(finalProduct, null, 2));
    console.log(`Saved product JSON to ${JSON_PATH}`);
    if (imagePaths.length) {
      console.log(`Saved ${imagePaths.length} images to ${IMAGE_DIR}`);
    }
  } catch (err) {
    console.error('Failed to scrape product:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Setup failures (directory creation, browser launch) reject before the
// try/catch above is entered, so they are reported here.
main().catch((err) => {
  console.error('Failed to scrape product:', err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,227 @@
import fs from 'fs/promises';
import path from 'path';
import { chromium } from 'playwright-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
chromium.use(StealthPlugin());
/**
 * One entry of a product's `options` list, built by `normalizeProducts` from
 * `POSMetaData.children`. Every field is null when the source value is missing.
 */
type Option = {
option?: string | null;
price?: number | null;
quantity?: number | null;
kioskQuantity?: number | null;
};
/**
 * Normalized product record written to the output file.
 * Produced by `normalizeProducts` from the raw GraphQL product objects.
 */
type Product = {
id: string;
slug?: string;
name: string;
brand?: string;
type?: string;
category?: string;
strainType?: string | null;
status?: string | null;
price?: number | null;
specialPrice?: number | null;
image?: string | null;
inStock: boolean;
options: Option[];
// `normalizeProducts` always sets this to undefined.
raw?: any;
};
// Path segment of the embedded-menu URL that is opened before querying.
const DISPENSARY_SLUG = 'AZ-Deeply-Rooted';
// Sent as `productsFilter.dispensaryId` in every query.
const DISPENSARY_ID = '6405ef617056e8014d79101b';
// Sent as the persisted query's `sha256Hash` for the FilteredProducts operation.
const HASH_FILTERED_PRODUCTS = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
// Output location: <cwd>/scrape-output/deeply-rooted/graphql-products.json.
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'deeply-rooted');
const OUTPUT_FILE = path.join(OUTPUT_DIR, 'graphql-products.json');
/** Creates the output directory, including any missing parent directories. */
async function ensureOutputDir() {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(OUTPUT_DIR, mkdirOptions);
}
/**
 * Opens the dispensary's embedded menu in a browser and pages through the
 * FilteredProducts persisted GraphQL query from inside that page.
 *
 * Requests 100 products per page for at most 40 pages. Paging stops at the
 * first non-OK response or the first page shorter than 100 items, so the
 * result can be partial. The browser is closed even when a step throws.
 *
 * @returns The fetched products after `normalizeProducts`.
 */
async function fetchAllProducts(): Promise<Product[]> {
  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const context = await browser.newContext({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.7390.37 Safari/537.36',
    });

    const page = await context.newPage();
    await page.goto(`https://dutchie.com/embedded-menu/${DISPENSARY_SLUG}`, {
      waitUntil: 'domcontentloaded',
      timeout: 90000,
    });
    await page.waitForTimeout(3000);

    const products: any[] = await page.evaluate(
      async ({ dispensaryId, hash }) => {
        // The session value is read from localStorage with surrounding quotes stripped.
        const sessionRaw = localStorage.getItem('dutchie-session');
        const session = sessionRaw ? sessionRaw.replace(/^\"|\"$/g, '') : '';

        const all: any[] = [];
        const perPage = 100;

        for (let pageIdx = 0; pageIdx < 40; pageIdx++) {
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId,
              pricingType: 'rec',
              Status: 'Active', // set to null to try to include inactive if exposed
              types: [],
              useCache: true,
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageIdx,
            perPage,
          };

          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify({
              persistedQuery: { version: 1, sha256Hash: hash },
            }),
          });

          const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;
          const res = await fetch(url, {
            headers: {
              'apollographql-client-name': 'Marketplace (production)',
              'x-dutchie-session': session,
              'content-type': 'application/json',
            },
            credentials: 'include',
          });

          if (!res.ok) {
            console.warn(`Request failed ${res.status} on page ${pageIdx}`);
            break;
          }

          const json = await res.json();
          const chunk = json?.data?.filteredProducts?.products || [];
          all.push(...chunk);

          // A short page means there is nothing further to request.
          if (chunk.length < perPage) break;
        }

        return all;
      },
      { dispensaryId: DISPENSARY_ID, hash: HASH_FILTERED_PRODUCTS }
    );

    return normalizeProducts(products);
  } finally {
    // Always release the browser, including when navigation or evaluate throws.
    await browser.close();
  }
}
/**
 * Converts raw product objects into `Product` records.
 *
 * - `options` come from `POSMetaData.children` (empty when absent).
 * - `price` is the first defined value among the first rec special price, the
 *   first rec price and the first entry of `Prices`.
 * - `image` is `Image`, else the URL of the first active entry in `images`.
 * - `inStock` is true when any option has a positive quantity or kiosk
 *   quantity, or when `isBelowThreshold` is falsy.
 *
 * @param items Raw product objects.
 * @returns One normalized record per input item, in the same order.
 */
function normalizeProducts(items: any[]): Product[] {
  const toOption = (child: any): Option => ({
    option: child.option ?? null,
    price: child.recPrice ?? child.price ?? child.medPrice ?? null,
    quantity: child.quantity ?? child.quantityAvailable ?? null,
    kioskQuantity: child.kioskQuantityAvailable ?? null,
  });
  const hasStock = (o: Option): boolean =>
    (o.quantity ?? 0) > 0 || (o.kioskQuantity ?? 0) > 0;

  const normalized: Product[] = [];
  for (const p of items) {
    const options: Option[] = p?.POSMetaData?.children?.map((child: any) => toOption(child)) || [];

    const recSpecial = p.recSpecialPrices && p.recSpecialPrices[0];
    const recRegular = p.recPrices && p.recPrices[0];
    const listed = p.Prices && p.Prices[0];
    const basePrice = recSpecial ?? recRegular ?? listed ?? null;

    const activeImageUrl = p.images && p.images.find((img: any) => img.active)?.url;
    const image = p.Image || activeImageUrl || null;

    const medSpecial = p.medicalSpecialPrices && p.medicalSpecialPrices[0];
    const inStock = options.some(hasStock) || !p.isBelowThreshold;

    normalized.push({
      id: p.id || p._id,
      slug: p.cName,
      name: p.Name,
      brand: p.brandName || p.brand?.name,
      type: p.type,
      category: p.subcategory,
      strainType: p.strainType,
      status: p.Status,
      price: basePrice,
      specialPrice: recSpecial || medSpecial || null,
      image,
      inStock,
      options,
      raw: undefined,
    });
  }
  return normalized;
}
/**
 * Computes stock counts and the ten most frequent brands.
 *
 * Brand names are trimmed before counting; a missing or empty brand counts as
 * "Unknown".
 *
 * @returns `total`, `inStock`, `outOfStock` and `topBrands` as
 *          `[brand, count]` pairs sorted by descending count.
 */
function summarize(products: Product[]) {
  const brandCounts = new Map<string, number>();
  let inStock = 0;
  products.forEach((product) => {
    if (product.inStock) inStock += 1;
    const brandName = (product.brand || 'Unknown').trim();
    brandCounts.set(brandName, (brandCounts.get(brandName) || 0) + 1);
  });

  const total = products.length;
  const topBrands = [...brandCounts.entries()]
    .sort(([, countA], [, countB]) => countB - countA)
    .slice(0, 10);
  return { total, inStock, outOfStock: total - inStock, topBrands };
}
/**
 * Returns a trimmed-down view of the first `n` products for console output.
 *
 * @param products Products to sample from.
 * @param n Number of leading products to include.
 */
function formatSample(products: Product[], n = 5) {
  const head = products.slice(0, n);
  return head.map(({ name, brand, price, specialPrice, inStock, options }) => ({
    name,
    brand,
    price,
    specialPrice,
    inStock,
    options,
  }));
}
/**
 * Fetches all products, writes them to OUTPUT_FILE and prints a short summary
 * (stock counts, top brands and a five-product sample).
 */
async function main() {
  await ensureOutputDir();

  const products = await fetchAllProducts();
  const serialized = JSON.stringify(products, null, 2);
  await fs.writeFile(OUTPUT_FILE, serialized);

  const { inStock, outOfStock, topBrands } = summarize(products);
  console.log(`Saved ${products.length} products to ${OUTPUT_FILE}`);
  console.log(`In stock: ${inStock} | Out of stock: ${outOfStock}`);
  console.log('Top brands:', topBrands);

  const sample = formatSample(products, 5);
  console.log('Sample:', JSON.stringify(sample, null, 2));
}

// Any failure is logged and terminates the process with exit code 1.
main().catch((err) => {
  console.error('GraphQL scrape failed:', err);
  process.exit(1);
});

View File

@@ -0,0 +1,90 @@
const fs = require('fs');
const path = require('path');
const INPUT = path.join(process.cwd(), 'scrape-output', 'deeply-rooted', 'inventory-by-brand.json');
const OUTPUT = path.join(process.cwd(), 'scrape-output', 'deeply-rooted', 'cleaned-inventory.json');
/**
 * Returns the lowest "$"-prefixed amount found in `text`.
 *
 * Amounts are whole numbers optionally followed by exactly two decimals.
 *
 * @param text Text to scan.
 * @param fallback Value returned when no amount is found.
 */
function extractPrice(text, fallback) {
  let lowest;
  for (const match of text.matchAll(/\$([0-9]+(?:\.[0-9]{2})?)/g)) {
    const amount = parseFloat(match[1]);
    if (lowest === undefined || amount < lowest) {
      lowest = amount;
    }
  }
  return lowest === undefined ? fallback : lowest;
}
/**
 * Splits a raw product name on the ellipsis character into a name and an
 * inferred brand.
 *
 * The first non-empty segment is the name. The second, with characters other
 * than letters, digits, whitespace, "-", "&" and "'" replaced by spaces, is
 * the inferred brand. `rawBrand` wins over the inferred brand; "Unknown" is
 * used when neither exists. The result is passed through `normalizeBrand`.
 *
 * @param rawName Scraped product name.
 * @param rawBrand Scraped brand, if any.
 * @returns `{ name, brand }`.
 */
function cleanBrandAndName(rawName, rawBrand) {
  const segments = [];
  for (const piece of rawName.split('…')) {
    const trimmed = piece.trim();
    if (trimmed) segments.push(trimmed);
  }
  const [head, tail] = segments;

  const name = head || rawName.trim();
  const inferredBrand = tail?.replace(/[^a-z0-9\s\-\&']/gi, ' ').replace(/\s+/g, ' ').trim();
  const chosenBrand = (rawBrand || inferredBrand || 'Unknown').trim();
  return { name, brand: normalizeBrand(chosenBrand) };
}
/**
 * Builds the cleaned record for one scraped product.
 *
 * Name and brand come from `cleanBrandAndName`. The price is the lowest
 * "$" amount found in the raw name, else the scraped price. A product counts
 * as in stock unless `inStock` is exactly `false`.
 */
function cleanProduct(p) {
  const cleaned = cleanBrandAndName(p.name, p.brand);
  const price = extractPrice(p.name, p.price);
  const { size, category, url, imageUrl } = p;
  return {
    name: cleaned.name,
    brand: cleaned.brand || 'Unknown',
    price,
    size,
    category,
    url,
    imageUrl,
    inStock: p.inStock !== false,
  };
}
/**
 * Removes duplicate products, keeping the first occurrence of each key.
 *
 * The key is the trimmed URL when one exists, otherwise the lowercased
 * "name|brand" pair.
 */
function dedupe(products) {
  const firstByKey = new Map();
  products.forEach((product) => {
    const identity = product.url
      ? product.url
      : `${product.name.toLowerCase()}|${product.brand.toLowerCase()}`;
    const key = identity.trim();
    if (firstByKey.has(key)) return;
    firstByKey.set(key, product);
  });
  return [...firstByKey.values()];
}
/**
 * Groups products by brand, preserving first-seen brand order.
 * Products without a brand are placed under "Unknown".
 *
 * @returns Array of `{ brand, products }` groups.
 */
function groupByBrand(products) {
  const buckets = new Map();
  for (const product of products) {
    const brand = product.brand || 'Unknown';
    const bucket = buckets.get(brand);
    if (bucket) {
      bucket.push(product);
    } else {
      buckets.set(brand, [product]);
    }
  }
  return Array.from(buckets, ([brand, prods]) => ({ brand, products: prods }));
}
/**
 * Maps known mangled brand spellings to their corrected form.
 *
 * A Map is used rather than a plain object so that brand strings matching
 * inherited object members (e.g. "constructor", "toString") are returned
 * unchanged instead of resolving to the inherited value.
 *
 * @param brand Brand name to normalize.
 * @returns The replacement when one is registered, otherwise `brand` itself.
 */
function normalizeBrand(brand) {
  const replacements = new Map([
    ['Gr n', 'Gron'],
  ]);
  return replacements.get(brand) || brand;
}
/**
 * Reads the brand-grouped inventory from INPUT, cleans and de-duplicates the
 * products, regroups them by brand, writes OUTPUT and prints summary counts.
 */
function main() {
  const groups = JSON.parse(fs.readFileSync(INPUT, 'utf8'));
  const cleaned = groups.flatMap((group) => group.products.map((p) => cleanProduct(p)));

  const unique = dedupe(cleaned);
  const grouped = groupByBrand(unique);
  fs.writeFileSync(OUTPUT, JSON.stringify(grouped, null, 2));

  const soldOut = unique.reduce((count, p) => (p.inStock ? count : count + 1), 0);
  const report = [
    `Cleaned products: ${unique.length}`,
    `Out of stock: ${soldOut}`,
    `Brands: ${grouped.length}`,
    `Saved to ${OUTPUT}`,
  ];
  report.forEach((line) => console.log(line));
}

main();

View File

@@ -0,0 +1,114 @@
import fs from 'fs/promises';
import path from 'path';
/** Product entry as read from the INPUT file, before cleaning. */
type RawProduct = {
name: string;
brand?: string;
price?: number;
size?: string;
category?: string;
url?: string;
imageUrl?: string;
inStock?: boolean;
};
/** One brand together with its cleaned products; the OUTPUT file is an array of these. */
type BrandGroup = {
brand: string;
products: CleanProduct[];
};
/**
 * Product after cleaning by `cleanProduct`: `brand` and `inStock` are always
 * set, the remaining optional fields are copied from the raw entry.
 */
type CleanProduct = {
name: string;
brand: string;
price?: number;
size?: string;
category?: string;
url?: string;
imageUrl?: string;
inStock: boolean;
};
// Brand-grouped inventory file that this script reads.
const INPUT = path.join(process.cwd(), 'scrape-output', 'deeply-rooted', 'inventory-by-brand.json');
// Destination for the cleaned, regrouped inventory.
const OUTPUT = path.join(process.cwd(), 'scrape-output', 'deeply-rooted', 'cleaned-inventory.json');
/**
 * Returns the lowest "$"-prefixed amount in `text` (usually the sale price).
 *
 * Amounts are whole numbers optionally followed by exactly two decimals.
 *
 * @param text Text to scan.
 * @param fallback Value returned when no amount is found.
 */
function extractPrice(text: string, fallback?: number): number | undefined {
  const dollarAmount = /\$([0-9]+(?:\.[0-9]{2})?)/g;
  const amounts = Array.from(text.matchAll(dollarAmount), (m) => parseFloat(m[1]));
  if (amounts.length === 0) {
    return fallback;
  }
  return amounts.reduce((lowest, value) => (value < lowest ? value : lowest));
}
/**
 * Splits a raw product name on the ellipsis character into a name and an
 * inferred brand.
 *
 * The first non-empty segment is the name. The second, with characters other
 * than letters, digits, whitespace, "-", "&" and "'" replaced by spaces, is
 * the inferred brand. `rawBrand` wins over the inferred brand; "Unknown" is
 * used when neither exists.
 *
 * @param rawName Scraped product name.
 * @param rawBrand Scraped brand, if any.
 */
function cleanBrandAndName(rawName: string, rawBrand?: string): { name: string; brand: string } {
  const [head, tail] = rawName
    .split('…')
    .map((segment) => segment.trim())
    .filter((segment) => segment.length > 0);

  let inferredBrand: string | undefined;
  if (tail !== undefined) {
    inferredBrand = tail.replace(/[^a-z0-9\s\-\&']/gi, ' ').replace(/\s+/g, ' ').trim();
  }

  return {
    name: head || rawName.trim(),
    brand: (rawBrand || inferredBrand || 'Unknown').trim(),
  };
}
/**
 * Builds the cleaned record for one raw product.
 *
 * Name and brand come from `cleanBrandAndName`. The price is the lowest
 * "$" amount found in the raw name, else the raw price. A product counts as
 * in stock unless `inStock` is exactly `false`.
 */
function cleanProduct(p: RawProduct): CleanProduct {
  const cleaned = cleanBrandAndName(p.name, p.brand);
  const price = extractPrice(p.name, p.price);
  const { size, category, url, imageUrl } = p;
  return {
    name: cleaned.name,
    brand: cleaned.brand || 'Unknown',
    price,
    size,
    category,
    url,
    imageUrl,
    inStock: p.inStock !== false,
  };
}
/**
 * Removes duplicate products, keeping the first occurrence of each key.
 *
 * The key is the trimmed URL when one exists, otherwise the lowercased
 * "name|brand" pair.
 */
function dedupe(products: CleanProduct[]): CleanProduct[] {
  const firstByKey = new Map<string, CleanProduct>();
  for (const product of products) {
    const identity = product.url || `${product.name.toLowerCase()}|${product.brand.toLowerCase()}`;
    const key = identity.trim();
    if (firstByKey.has(key)) continue;
    firstByKey.set(key, product);
  }
  return [...firstByKey.values()];
}
/**
 * Groups products by brand, preserving first-seen brand order.
 * Products with an empty brand are placed under "Unknown".
 */
function groupByBrand(products: CleanProduct[]): BrandGroup[] {
  const buckets = new Map<string, CleanProduct[]>();
  for (const product of products) {
    const brand = product.brand || 'Unknown';
    const bucket = buckets.get(brand);
    if (bucket) {
      bucket.push(product);
    } else {
      buckets.set(brand, [product]);
    }
  }
  return Array.from(buckets, ([brand, prods]) => ({ brand, products: prods }));
}
/**
 * Reads the brand-grouped inventory from INPUT, cleans and de-duplicates the
 * products, regroups them by brand, writes OUTPUT and prints summary counts.
 */
async function main() {
  const text = await fs.readFile(INPUT, 'utf8');
  const raw = JSON.parse(text) as { brand: string; products: RawProduct[] }[];
  const flattened: CleanProduct[] = raw.flatMap((group) => group.products.map((p) => cleanProduct(p)));

  const unique = dedupe(flattened);
  const grouped = groupByBrand(unique);
  await fs.writeFile(OUTPUT, JSON.stringify(grouped, null, 2));

  const soldOut = unique.reduce((count, p) => (p.inStock ? count : count + 1), 0);
  const report = [
    `Cleaned products: ${unique.length}`,
    `Out of stock: ${soldOut}`,
    `Brands: ${grouped.length}`,
    `Saved to ${OUTPUT}`,
  ];
  report.forEach((line) => console.log(line));
}

// Failures are logged and reported through the process exit code.
main().catch((err) => {
  console.error('Post-process failed:', err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,183 @@
import { chromium as playwright } from 'playwright-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import fs from 'fs/promises';
import path from 'path';
playwright.use(StealthPlugin());
/** One product card as extracted from the menu page by `extractProducts`. */
type Product = {
name: string;
brand?: string;
price?: number;
size?: string;
category?: string;
url?: string;
imageUrl?: string;
// False when the card text contains "sold out" or "out of stock".
inStock: boolean;
};
/** One brand together with its products; the JSON output is an array of these. */
type BrandGroup = {
brand: string;
products: Product[];
};
// Menu page that the script opens and scrolls through.
const TARGET_URL = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
// Output location: <cwd>/scrape-output/deeply-rooted/inventory-by-brand.json.
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'deeply-rooted');
const JSON_PATH = path.join(OUTPUT_DIR, 'inventory-by-brand.json');
/** Creates the output directory, including any missing parent directories. */
async function ensureDirs(): Promise<void> {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(OUTPUT_DIR, mkdirOptions);
}
/**
 * Polls the page until it no longer looks like a challenge page.
 *
 * A page counts as a challenge when its title contains "Attention Required" or
 * "Just a moment", or its HTML contains "challenge-platform" or "cf-challenge".
 * Errors while reading the title or content are treated as empty strings.
 *
 * @param page Page object exposing `title`, `content` and `waitForTimeout`.
 * @param maxWaitMs Maximum time to keep polling, in milliseconds.
 * @returns `true` once no challenge marker is seen, `false` if time runs out.
 */
async function waitForCloudflare(page: any, maxWaitMs = 60000): Promise<boolean> {
  const looksLikeChallenge = (title: any, html: any): boolean =>
    title.includes('Attention Required') ||
    title.includes('Just a moment') ||
    html.includes('challenge-platform') ||
    html.includes('cf-challenge');

  const startedAt = Date.now();
  let passed = false;
  while (!passed && Date.now() - startedAt < maxWaitMs) {
    const title = await page.title().catch(() => '');
    const html = await page.content().catch(() => '');
    passed = !looksLikeChallenge(title, html);
    if (!passed) {
      // Still blocked: pause before the next check.
      await page.waitForTimeout(2000);
    }
  }
  return passed;
}
/**
 * Scrolls the page with the mouse wheel until the number of product/card
 * elements stops growing (at most 40 rounds), then scrolls back to the top.
 *
 * @param page Page object exposing `$$eval`, `mouse.wheel`, `waitForTimeout`
 *             and `evaluate`.
 */
async function loadAllProducts(page: any): Promise<void> {
  let roundsLeft = 40;
  while (roundsLeft > 0) {
    roundsLeft -= 1;
    const countBefore = await page.$$eval('[data-testid*="product"], [data-testid*="card"]', (els) => els.length);
    await page.mouse.wheel(0, 1400);
    await page.waitForTimeout(900);
    const countAfter = await page.$$eval('[data-testid*="product"], [data-testid*="card"]', (els) => els.length);
    // No new elements appeared after this scroll: stop.
    if (!(countAfter > countBefore)) break;
  }
  await page.evaluate(() => window.scrollTo({ top: 0 }));
}
/**
 * Extracts every product card on the page.
 *
 * The extraction logic is passed to `page.evaluate` as a source string, so it
 * must stay self-contained plain JavaScript.
 *
 * @param page Page object exposing `evaluate`.
 * @returns One entry per card that yielded a non-empty name.
 */
async function extractProducts(page: any): Promise<Product[]> {
  const script = `
    (() => {
      // Prefer a "$"-prefixed amount so that card text such as
      // "Gummies 10pk $18.00" yields 18 rather than the leading 10;
      // fall back to the first bare number.
      function parsePrice(text) {
        if (!text) return undefined;
        const match =
          text.match(/\\$\\s*(\\d+(?:\\.\\d{1,2})?)/) ||
          text.match(/(\\d+(?:\\.\\d{1,2})?)/);
        return match ? parseFloat(match[1]) : undefined;
      }

      function pickImage(card) {
        const imgEl =
          card.querySelector('img[src^="http"]') ||
          card.querySelector('source[srcset]');
        if (imgEl && imgEl.src && imgEl.src.startsWith('http')) {
          return imgEl.src;
        }
        if (imgEl && imgEl.srcset) {
          const first = imgEl.srcset.split(',')[0]?.trim().split(' ')[0];
          if (first && first.startsWith('http')) return first;
        }
        const dataSrc = card.querySelector('img[data-src]')?.getAttribute('data-src');
        if (dataSrc && dataSrc.startsWith('http')) return dataSrc;
        return undefined;
      }

      const cards = Array.from(
        document.querySelectorAll('[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]')
      );

      return cards
        .map((card) => {
          const name =
            card.querySelector('[data-testid="product-card-name"]')?.innerText?.trim() ||
            card.querySelector('[data-testid="product-name"]')?.innerText?.trim() ||
            card.querySelector('h3, h4')?.innerText?.trim() ||
            (card.textContent || '').split('\\n').map((t) => t.trim()).find((t) => t.length > 3) ||
            '';

          const brand =
            card.querySelector('[data-testid="product-card-brand"]')?.innerText?.trim() ||
            card.querySelector('[data-testid="product-brand"]')?.innerText?.trim() ||
            undefined;

          const priceText =
            card.querySelector('[data-testid="product-card-price"]')?.innerText ||
            card.textContent ||
            '';
          const price = parsePrice(priceText);

          const size =
            card.querySelector('[data-testid*="size"]')?.innerText?.trim() ||
            card.querySelector('[data-testid*="weight"]')?.innerText?.trim() ||
            undefined;

          const category =
            card.querySelector('[data-testid*="category"]')?.innerText?.trim() ||
            undefined;

          const link = card.querySelector('a[href*="/product/"]');
          const url = link?.href;

          const imageUrl = pickImage(card);

          const cardText = (card.textContent || '').toLowerCase();
          const inStock = !(cardText.includes('sold out') || cardText.includes('out of stock'));

          return { name, brand, price, size, category, url, imageUrl, inStock };
        })
        .filter((p) => p.name);
    })();
  `;
  return page.evaluate(script);
}
/**
 * Groups products by brand, preserving first-seen brand order.
 * Products without a brand are placed under "Unknown".
 */
function groupByBrand(products: Product[]): BrandGroup[] {
  const buckets = new Map<string, Product[]>();
  products.forEach((product) => {
    const brand = product.brand || 'Unknown';
    const existing = buckets.get(brand);
    if (existing === undefined) {
      buckets.set(brand, [product]);
    } else {
      existing.push(product);
    }
  });
  return [...buckets.entries()].map(([brand, prods]) => ({ brand, products: prods }));
}
/**
 * Scrapes the full menu at TARGET_URL and writes the products, grouped by
 * brand, to JSON_PATH.
 *
 * Scrape failures are logged and reported through `process.exitCode = 1`.
 * The browser is closed in every case once it has been launched.
 */
async function main() {
  await ensureDirs();

  const browser = await playwright.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
  });

  try {
    // Created inside the try so a failure here still closes the browser.
    const page = await browser.newPage({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    console.log(`Navigating to ${TARGET_URL}...`);
    await page.goto(TARGET_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });

    const cfOk = await waitForCloudflare(page, 60000);
    if (!cfOk) throw new Error('Cloudflare challenge not passed in time');

    // Best effort: continue even if no product/card element shows up in time.
    await page.waitForSelector('[data-testid*="product"], [data-testid*="card"]', { timeout: 60000 }).catch(() => undefined);

    await loadAllProducts(page);
    const products = await extractProducts(page);
    const grouped = groupByBrand(products);

    await fs.writeFile(JSON_PATH, JSON.stringify(grouped, null, 2));
    console.log(`Found ${products.length} products across ${grouped.length} brands`);
    console.log(`Saved grouped inventory to ${JSON_PATH}`);
  } catch (err) {
    console.error('Inventory scrape failed:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Setup failures (directory creation, browser launch) reject before the
// try/catch above is entered, so they are reported here.
main().catch((err) => {
  console.error('Inventory scrape failed:', err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,115 @@
import { chromium, Frame } from 'playwright';
/** One product card as extracted from the embedded menu by `extractProducts`. */
type Product = {
name: string;
brand?: string;
price?: number;
size?: string;
category?: string;
url?: string;
};
/**
 * Locates the embedded menu iframe on the page and returns its frame.
 *
 * Waits up to 45 s for a matching iframe element, then up to 30 s for the
 * frame to reach the "domcontentloaded" state.
 *
 * @param page Page object exposing `waitForSelector`.
 * @throws Error when the iframe element has no content frame.
 */
async function getDutchieFrame(page: any): Promise<Frame> {
  const iframeSelector = 'iframe[src*="dutchie"], iframe[srcdoc*="dutchie"], iframe[id^="iframe-"]';
  const handle = await page.waitForSelector(iframeSelector, { timeout: 45000 });

  const embedded = await handle.contentFrame();
  if (embedded) {
    await embedded.waitForLoadState('domcontentloaded', { timeout: 30000 });
    return embedded;
  }
  throw new Error('Unable to access embedded Dutchie iframe.');
}
/**
 * Scrolls with the mouse wheel until the number of product/card elements in
 * the frame stops growing (at most 20 rounds), then scrolls the frame back to
 * the top.
 *
 * Playwright's `Frame` has no `mouse`; wheel events are dispatched through the
 * page that owns the frame.
 *
 * @param frame Frame containing the embedded menu.
 */
async function loadAllProducts(frame: Frame): Promise<void> {
  const cardSelector = '[data-testid*="product"], [data-testid*="card"]';
  const maxScrolls = 20;
  for (let i = 0; i < maxScrolls; i++) {
    const beforeCount = await frame.$$eval(cardSelector, els => els.length);
    await frame.page().mouse.wheel(0, 1200);
    await frame.waitForTimeout(800);
    const afterCount = await frame.$$eval(cardSelector, els => els.length);
    // No new elements appeared after this scroll: stop.
    if (afterCount <= beforeCount) break;
  }
  await frame.evaluate(() => window.scrollTo({ top: 0 }));
}
/**
 * Extracts every product card from the embedded menu frame.
 *
 * The callback handed to `frame.evaluate` reads `document`, so all helpers it
 * needs are defined inside it.
 *
 * @param frame Frame containing the embedded menu.
 * @returns One entry per card that yielded a non-empty name.
 */
async function extractProducts(frame: Frame): Promise<Product[]> {
  return frame.evaluate(() => {
    // Prefer a "$"-prefixed amount so that card text such as
    // "Gummies 10pk $18.00" yields 18 rather than the leading 10;
    // fall back to the first bare number.
    const parsePrice = (text: string): number | undefined => {
      const match = text.match(/\$\s*(\d+(?:\.\d{2})?)/) ?? text.match(/(\d+(?:\.\d{2})?)/);
      return match ? parseFloat(match[1]) : undefined;
    };

    const cards = Array.from(
      document.querySelectorAll('[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]')
    );

    return cards.map((card: Element) => {
      // Name falls back to the first text line longer than three characters.
      const name =
        (card.querySelector('[data-testid="product-card-name"]') as HTMLElement)?.innerText?.trim() ||
        (card.querySelector('[data-testid="product-name"]') as HTMLElement)?.innerText?.trim() ||
        (card.querySelector('h3, h4') as HTMLElement)?.innerText?.trim() ||
        (card.textContent || '').split('\n').map(t => t.trim()).find(t => t.length > 3) ||
        '';

      const brand =
        (card.querySelector('[data-testid="product-card-brand"]') as HTMLElement)?.innerText?.trim() ||
        (card.querySelector('[data-testid="product-brand"]') as HTMLElement)?.innerText?.trim() ||
        undefined;

      const priceText =
        (card.querySelector('[data-testid="product-card-price"]') as HTMLElement)?.innerText ||
        (card.textContent || '');
      const price = parsePrice(priceText);

      const size =
        (card.querySelector('[data-testid*="size"]') as HTMLElement)?.innerText?.trim() ||
        (card.querySelector('[data-testid*="weight"]') as HTMLElement)?.innerText?.trim() ||
        undefined;

      const category =
        (card.querySelector('[data-testid*="category"]') as HTMLElement)?.innerText?.trim() ||
        undefined;

      const link = card.querySelector('a[href*="/product/"]') as HTMLAnchorElement | null;
      const url = link?.href;

      return { name, brand, price, size, category, url };
    }).filter(p => p.name);
  });
}
/**
 * Opens the menu page, extracts the products from the embedded menu frame and
 * prints the count plus the first 20 products.
 *
 * Scrape failures are logged and reported through `process.exitCode = 1`.
 * The browser is closed in every case once it has been launched.
 */
async function main() {
  const targetUrl = 'https://azdeeplyrooted.com/menu';

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
  });

  try {
    // Created inside the try so a failure here still closes the browser.
    const page = await browser.newPage({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    console.log(`Navigating to ${targetUrl}...`);
    await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    const frame = await getDutchieFrame(page);
    // Best effort: continue even if no product/card element shows up in time.
    await frame.waitForSelector('[data-testid*="product"], [data-testid*="card"]', { timeout: 60000 }).catch(() => undefined);

    await loadAllProducts(frame);
    const products = await extractProducts(frame);

    console.log(`Found ${products.length} products`);
    console.log(JSON.stringify(products.slice(0, 20), null, 2));
  } catch (err) {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// A failed browser launch rejects before the try/catch above is entered, so
// it is reported here.
main().catch((err) => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,191 @@
import { chromium, Frame } from 'playwright';
import fs from 'fs/promises';
import path from 'path';
/** One product card as extracted from the embedded menu by `extractProducts`. */
type Product = {
name: string;
brand?: string;
price?: number;
size?: string;
category?: string;
url?: string;
imageUrl?: string;
};
// Product plus the local path of its downloaded image (set by `downloadImages` on success).
type ProductWithImagePath = Product & { imagePath?: string };
// Menu page that hosts the embedded menu iframe.
const TARGET_URL = 'https://azdeeplyrooted.com/menu';
// All output is written under <cwd>/scrape-output/deeply-rooted.
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'deeply-rooted');
// Directory that receives downloaded product images.
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');
// JSON file holding the products with their image paths.
const JSON_PATH = path.join(OUTPUT_DIR, 'products.json');
/** Creates the image output directory, including any missing parent directories. */
async function ensureDirs(): Promise<void> {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(IMAGE_DIR, mkdirOptions);
}
/**
 * Locates the embedded menu iframe on the page and returns its frame.
 *
 * Waits up to 45 s for a matching iframe element, then up to 30 s for the
 * frame to reach the "domcontentloaded" state.
 *
 * @param page Page object exposing `waitForSelector`.
 * @throws Error when the iframe element has no content frame.
 */
async function getDutchieFrame(page: any): Promise<Frame> {
  const iframeSelector = 'iframe[src*="dutchie"], iframe[srcdoc*="dutchie"], iframe[id^="iframe-"]';
  const handle = await page.waitForSelector(iframeSelector, { timeout: 45000 });

  const embedded = await handle.contentFrame();
  if (embedded) {
    await embedded.waitForLoadState('domcontentloaded', { timeout: 30000 });
    return embedded;
  }
  throw new Error('Unable to access embedded Dutchie iframe.');
}
/**
 * Scrolls with the mouse wheel until the number of product/card elements in
 * the frame stops growing (at most 30 rounds), then scrolls the frame back to
 * the top.
 *
 * Playwright's `Frame` has no `mouse`; wheel events are dispatched through the
 * page that owns the frame.
 *
 * @param frame Frame containing the embedded menu.
 */
async function loadAllProducts(frame: Frame): Promise<void> {
  const cardSelector = '[data-testid*="product"], [data-testid*="card"]';
  const maxScrolls = 30;
  for (let i = 0; i < maxScrolls; i++) {
    const beforeCount = await frame.$$eval(cardSelector, els => els.length);
    await frame.page().mouse.wheel(0, 1200);
    await frame.waitForTimeout(900);
    const afterCount = await frame.$$eval(cardSelector, els => els.length);
    // No new elements appeared after this scroll: stop.
    if (afterCount <= beforeCount) break;
  }
  await frame.evaluate(() => window.scrollTo({ top: 0 }));
}
/**
 * Extracts every product card, including an image URL, from the embedded menu
 * frame.
 *
 * The callback handed to `frame.evaluate` reads `document`, so all helpers it
 * needs are defined inside it.
 *
 * @param frame Frame containing the embedded menu.
 * @returns One entry per card that yielded a non-empty name.
 */
async function extractProducts(frame: Frame): Promise<Product[]> {
  return frame.evaluate(() => {
    const cards = Array.from(
      document.querySelectorAll('[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]')
    );

    // Prefer a "$"-prefixed amount so that card text such as
    // "Gummies 10pk $18.00" yields 18 rather than the leading 10;
    // fall back to the first bare number.
    const parsePrice = (text: string): number | undefined => {
      const match = text.match(/\$\s*(\d+(?:\.\d{2})?)/) ?? text.match(/(\d+(?:\.\d{2})?)/);
      return match ? parseFloat(match[1]) : undefined;
    };

    // Picks an absolute image URL: the element's src, else the first srcset
    // entry, else a data-src attribute.
    const pickImage = (card: Element): string | undefined => {
      const imgEl =
        (card.querySelector('img[src^="http"]') as HTMLImageElement | null) ||
        (card.querySelector('source[srcset]') as HTMLSourceElement | null);
      if (imgEl && 'src' in imgEl && typeof imgEl.src === 'string' && imgEl.src.startsWith('http')) {
        return imgEl.src;
      }
      if (imgEl && 'srcset' in imgEl && typeof (imgEl as any).srcset === 'string') {
        const first = (imgEl as any).srcset.split(',')[0]?.trim().split(' ')[0];
        if (first?.startsWith('http')) return first;
      }
      const dataSrc = (card.querySelector('img[data-src]') as HTMLImageElement | null)?.getAttribute('data-src');
      if (dataSrc?.startsWith('http')) return dataSrc;
      return undefined;
    };

    return cards
      .map((card: Element) => {
        // Name falls back to the first text line longer than three characters.
        const name =
          (card.querySelector('[data-testid="product-card-name"]') as HTMLElement)?.innerText?.trim() ||
          (card.querySelector('[data-testid="product-name"]') as HTMLElement)?.innerText?.trim() ||
          (card.querySelector('h3, h4') as HTMLElement)?.innerText?.trim() ||
          (card.textContent || '').split('\n').map(t => t.trim()).find(t => t.length > 3) ||
          '';

        const brand =
          (card.querySelector('[data-testid="product-card-brand"]') as HTMLElement)?.innerText?.trim() ||
          (card.querySelector('[data-testid="product-brand"]') as HTMLElement)?.innerText?.trim() ||
          undefined;

        const priceText =
          (card.querySelector('[data-testid="product-card-price"]') as HTMLElement)?.innerText ||
          (card.textContent || '');
        const price = parsePrice(priceText);

        const size =
          (card.querySelector('[data-testid*="size"]') as HTMLElement)?.innerText?.trim() ||
          (card.querySelector('[data-testid*="weight"]') as HTMLElement)?.innerText?.trim() ||
          undefined;

        const category =
          (card.querySelector('[data-testid*="category"]') as HTMLElement)?.innerText?.trim() ||
          undefined;

        const link = card.querySelector('a[href*="/product/"]') as HTMLAnchorElement | null;
        const url = link?.href;

        const imageUrl = pickImage(card);

        return { name, brand, price, size, category, url, imageUrl };
      })
      .filter(p => p.name);
  });
}
/**
 * Builds a lowercase, dash-separated file name from arbitrary text.
 *
 * Runs of characters other than a-z / 0-9 become a single dash, one leading and
 * one trailing dash are stripped, and an empty result falls back to "product".
 *
 * @param base Text to derive the file stem from.
 * @param ext Extension appended after a dot (used as given).
 */
function safeFileName(base: string, ext: string): string {
  const slug = base
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
  const stem = slug || 'product';
  return `${stem}.${ext}`;
}
/**
 * Downloads each product's image into IMAGE_DIR, one at a time.
 *
 * The file name is derived from the product name. The extension comes from the
 * URL path when present, otherwise from the response content type
 * (png / jpeg / jpg / webp / gif, else "bin"). Products without an image URL,
 * or whose download fails (logged with `console.warn`), are returned unchanged.
 *
 * @param products Products to process.
 * @returns The products in input order, with `imagePath` set on success.
 */
async function downloadImages(products: Product[]): Promise<ProductWithImagePath[]> {
  // Checked in order; the first content-type fragment that matches wins.
  const typeToExt: Array<[string, string]> = [
    ['png', 'png'],
    ['jpeg', 'jpg'],
    ['jpg', 'jpg'],
    ['webp', 'webp'],
    ['gif', 'gif'],
  ];

  const results: ProductWithImagePath[] = [];
  for (const product of products) {
    const { imageUrl } = product;
    if (!imageUrl) {
      results.push(product);
      continue;
    }

    try {
      const response = await fetch(imageUrl);
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      const bytes = await response.arrayBuffer();

      const contentType = response.headers.get('content-type') || '';
      const matched = typeToExt.find(([fragment]) => contentType.includes(fragment));
      const extFromType = matched ? matched[1] : 'bin';
      const urlExt = path.extname(new URL(imageUrl).pathname).replace('.', '');
      const ext = urlExt || extFromType || 'bin';

      const imagePath = path.join(IMAGE_DIR, safeFileName(product.name || 'product', ext));
      await fs.writeFile(imagePath, Buffer.from(bytes));
      results.push({ ...product, imagePath });
    } catch (err) {
      console.warn(`Failed to download image for ${product.name}: ${err}`);
      results.push(product);
    }
  }
  return results;
}
/**
 * Opens the menu page, extracts the products from the embedded menu frame,
 * downloads their images and writes the result to JSON_PATH.
 *
 * Scrape failures are logged and reported through `process.exitCode = 1`.
 * The browser is closed in every case once it has been launched.
 */
async function main() {
  await ensureDirs();

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
  });

  try {
    // Created inside the try so a failure here still closes the browser.
    const page = await browser.newPage({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    console.log(`Navigating to ${TARGET_URL}...`);
    await page.goto(TARGET_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });

    const frame = await getDutchieFrame(page);
    // Best effort: continue even if no product/card element shows up in time.
    await frame.waitForSelector('[data-testid*="product"], [data-testid*="card"]', { timeout: 60000 }).catch(() => undefined);

    await loadAllProducts(frame);
    const products = await extractProducts(frame);
    console.log(`Found ${products.length} products, downloading images...`);

    const withImages = await downloadImages(products);
    await fs.writeFile(JSON_PATH, JSON.stringify(withImages, null, 2));

    console.log(`Saved data to ${JSON_PATH}`);
    console.log(`Images stored in ${IMAGE_DIR}`);
  } catch (err) {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Setup failures (directory creation, browser launch) reject before the
// try/catch above is entered, so they are reported here.
main().catch((err) => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});