fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
214
backend/new-scrapers/fetch-dutchie-product.ts
Normal file
214
backend/new-scrapers/fetch-dutchie-product.ts
Normal file
@@ -0,0 +1,214 @@
|
||||
import { chromium as playwright } from 'playwright-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
playwright.use(StealthPlugin());
|
||||
|
||||
/** One variant/option element scraped from the product page. */
type ProductVariant = {
  /** Visible text of the variant element. */
  label: string;
  /** Price parsed out of the variant element's text, when one is found. */
  price?: number;
  /** Declared for callers; the extractor in this file never populates it. */
  inventory?: string;
};

/** Everything the single-product scraper collects from the rendered page. */
type ProductData = {
  /** Product title; empty string when no title selector matched. */
  name: string;
  brand?: string;
  /** Price parsed from the first matching price element. */
  price?: number;
  description?: string;
  /** Raw potency text that mentions "thc" (not parsed into a number). */
  thc?: string;
  /** Raw potency text that mentions "cbd" (not parsed into a number). */
  cbd?: string;
  /** Text of the breadcrumb/category element. */
  category?: string;
  variants?: ProductVariant[];
  /** De-duplicated image URLs found on the page. */
  images: string[];
  /** `window.location.href` at extraction time. */
  productUrl: string;
};
|
||||
|
||||
/** Hard-coded product page that this script scrapes. */
const PRODUCT_URL =
  'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/product/mfused-loud-liquid-diamonds-aio-stoopid-gas';

// All output lives under <cwd>/scrape-output/dutchie-product.
const OUTPUT_DIR = path.resolve(process.cwd(), 'scrape-output', 'dutchie-product');
const IMAGE_DIR = path.resolve(OUTPUT_DIR, 'images');
const JSON_PATH = path.resolve(OUTPUT_DIR, 'product.json');
|
||||
|
||||
/**
 * Create the output folders if they are missing.
 * IMAGE_DIR sits inside OUTPUT_DIR, so a single recursive mkdir covers both.
 */
async function ensureDirs(): Promise<void> {
  const options = { recursive: true };
  await fs.mkdir(IMAGE_DIR, options);
}
|
||||
|
||||
/**
 * Poll the page until it no longer looks like a Cloudflare challenge.
 *
 * @param page      Object exposing `title()`, `content()` and `waitForTimeout()`.
 * @param maxWaitMs Give up once this many milliseconds have elapsed.
 * @returns `true` as soon as no challenge marker is seen, `false` on timeout.
 */
async function waitForCloudflare(page: any, maxWaitMs = 60000): Promise<boolean> {
  const titleMarkers = ['Attention Required', 'Just a moment'];
  const htmlMarkers = ['challenge-platform', 'cf-challenge'];
  const startedAt = Date.now();

  for (;;) {
    if (Date.now() - startedAt >= maxWaitMs) return false;

    // A failed read (e.g. mid-navigation) is treated as empty text.
    const title: string = await page.title().catch(() => '');
    const html: string = await page.content().catch(() => '');

    const blocked =
      titleMarkers.some((marker) => title.includes(marker)) ||
      htmlMarkers.some((marker) => html.includes(marker));
    if (!blocked) return true;

    await page.waitForTimeout(2000);
  }
}
|
||||
|
||||
/**
 * Read the product fields out of the currently loaded product page.
 *
 * The callback passed to `page.evaluate` runs inside the page, so every helper
 * it needs is declared inside it and nothing from module scope is referenced.
 *
 * @param page Object exposing `evaluate()` (a Playwright page in this script).
 * @returns The scraped product; `name` is '' when no title selector matched.
 */
async function extractProduct(page: any): Promise<ProductData> {
  return page.evaluate(() => {
    /** Trimmed innerText of the first selector that yields non-empty text. */
    const pickText = (selectorList: string[]): string | undefined => {
      for (const sel of selectorList) {
        const el = document.querySelector(sel) as HTMLElement | null;
        const txt = el?.innerText?.trim();
        if (txt) return txt;
      }
      return undefined;
    };

    /** Trimmed, non-empty innerText of every element matching `selector`. */
    const pickAllTexts = (selector: string): string[] =>
      Array.from(document.querySelectorAll(selector))
        .map((el) => (el as HTMLElement).innerText?.trim())
        .filter(Boolean) as string[];

    /**
     * Parse a price from free text. A "$"-prefixed amount wins, so text such
     * as "1g $45.00" yields 45 rather than the leading weight; text without a
     * dollar sign falls back to its first number.
     */
    const parsePrice = (text?: string | null): number | undefined => {
      if (!text) return undefined;
      const match =
        text.match(/\$\s*(\d+(?:\.\d{1,2})?)/) ?? text.match(/(\d+(?:\.\d{1,2})?)/);
      return match ? parseFloat(match[1]) : undefined;
    };

    const name =
      pickText(['[data-testid="product-name"]', 'h1', '[class*="ProductTitle"]']) || '';
    const brand = pickText(['[data-testid="product-brand"]', '[class*="Brand"]']);
    const priceText =
      pickText([
        '[data-testid="product-price"]',
        '[data-testid*="price"]',
        '[class*="Price"]',
      ]) || '';
    const description = pickText(['[data-testid="product-description"]', 'article', '[class*="Description"]']);

    // Potency is kept as raw text; the first entry mentioning each cannabinoid wins.
    const potencyTexts = pickAllTexts('[data-testid*="thc"], [data-testid*="cbd"], [class*="Potency"]');
    const thc = potencyTexts.find((t) => t.toLowerCase().includes('thc'));
    const cbd = potencyTexts.find((t) => t.toLowerCase().includes('cbd'));

    const category =
      pickText(['[data-testid="breadcrumb"]', '[class*="Breadcrumb"]', '[data-testid*="category"]']) || undefined;

    const variantEls = Array.from(
      document.querySelectorAll('[data-testid*="variant"], [data-testid*="option"], [class*="Variant"]')
    );
    const variants = variantEls
      .map((el) => {
        const label =
          (el.querySelector('span,div') as HTMLElement | null)?.innerText?.trim() ||
          el.textContent?.trim() ||
          '';
        const price = parsePrice(el.textContent || undefined);
        return { label, price };
      })
      .filter((v) => v.label);

    // For <source> elements only the first srcset candidate's URL is kept.
    const imageUrls = Array.from(
      document.querySelectorAll('img[src*="images.dutchie.com"], source[srcset*="images.dutchie.com"]')
    )
      .map((el) => {
        if (el instanceof HTMLImageElement) return el.src;
        const srcset = (el as HTMLSourceElement).srcset || '';
        return srcset.split(',')[0]?.trim().split(' ')[0];
      })
      .filter((u): u is string => !!u);

    return {
      name,
      brand,
      price: parsePrice(priceText),
      description,
      thc,
      cbd,
      category,
      variants,
      images: Array.from(new Set(imageUrls)),
      productUrl: window.location.href,
    };
  });
}
|
||||
|
||||
/**
 * Build a filesystem-friendly file name.
 *
 * `base` is lower-cased, every run of characters outside a-z/0-9 becomes a
 * single dash, and one leading and one trailing dash are stripped. If nothing
 * is left, 'image' is used. `ext` is appended as-is after a dot.
 */
function safeFileName(base: string, ext: string): string {
  const dashed = base.toLowerCase().replace(/[^a-z0-9]+/g, '-');
  const slug = dashed.replace(/^-|-$/g, '');
  const stem = slug || 'image';
  return `${stem}.${ext}`;
}
|
||||
|
||||
/**
 * Download each image URL into IMAGE_DIR.
 *
 * Downloads run one after another. A failed download is logged with
 * `console.warn` and skipped, so one bad URL does not abort the rest.
 *
 * File names come from the URL's *path* only, so query strings and fragments
 * cannot leak into the name. When two URLs map to the same name, a numeric
 * suffix is added instead of overwriting the earlier file.
 *
 * @param urls Absolute image URLs.
 * @returns Paths of the files that were written, in download order.
 */
async function downloadImages(urls: string[]): Promise<string[]> {
  const saved: string[] = [];
  const usedNames = new Set<string>();

  for (const url of urls) {
    try {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const buf = Buffer.from(await res.arrayBuffer());

      const contentType = res.headers.get('content-type') || '';
      const { pathname } = new URL(url);
      const urlExt = path.extname(pathname).replace('.', '');
      // Prefer the extension in the URL; otherwise guess from the content type.
      const ext =
        urlExt ||
        (contentType.includes('png')
          ? 'png'
          : contentType.includes('jpeg')
          ? 'jpg'
          : contentType.includes('webp')
          ? 'webp'
          : 'bin');

      const base = path.basename(pathname).split('.')[0] || 'image';
      let fileName = safeFileName(base, ext);
      for (let n = 2; usedNames.has(fileName); n++) {
        fileName = safeFileName(`${base}-${n}`, ext);
      }
      usedNames.add(fileName);

      const filePath = path.join(IMAGE_DIR, fileName);
      await fs.writeFile(filePath, buf);
      saved.push(filePath);
    } catch (err) {
      console.warn(`Failed to download image ${url}:`, err);
    }
  }

  return saved;
}
|
||||
|
||||
/**
 * Entry point: open the product page, wait out any Cloudflare challenge,
 * extract the product, download its images and write everything to JSON_PATH.
 *
 * Scrape failures are logged and reported through `process.exitCode = 1`.
 * The browser context and page are created inside the `try`, so the browser
 * is closed even when their creation fails.
 */
async function main() {
  await ensureDirs();

  const browser = await playwright.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    const context = await browser.newContext({
      viewport: { width: 1280, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    console.log(`Navigating to product page...`);
    await page.goto(PRODUCT_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });

    const cfOk = await waitForCloudflare(page, 60000);
    if (!cfOk) {
      throw new Error('Cloudflare challenge not passed in time');
    }

    // Best effort: extraction is still attempted if the selector never appears.
    await page.waitForSelector('[data-testid*="product"]', { timeout: 60000 }).catch(() => undefined);
    await page.waitForTimeout(2000);

    const product = await extractProduct(page);
    console.log('Extracted product:');
    console.log(product);

    const imagePaths = await downloadImages(product.images);
    const finalProduct = { ...product, imagePaths };

    await fs.writeFile(JSON_PATH, JSON.stringify(finalProduct, null, 2));

    console.log(`Saved product JSON to ${JSON_PATH}`);
    if (imagePaths.length) {
      console.log(`Saved ${imagePaths.length} images to ${IMAGE_DIR}`);
    }
  } catch (err) {
    console.error('Failed to scrape product:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Directory creation and browser launch happen before the try block above;
// catch their rejections here instead of leaving them unhandled.
main().catch((err) => {
  console.error('Failed to scrape product:', err);
  process.exitCode = 1;
});
|
||||
227
backend/new-scrapers/graphql-deeply-rooted-products.ts
Normal file
227
backend/new-scrapers/graphql-deeply-rooted-products.ts
Normal file
@@ -0,0 +1,227 @@
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import { chromium } from 'playwright-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
|
||||
chromium.use(StealthPlugin());
|
||||
|
||||
/** One entry of a product's `POSMetaData.children`, reduced to the fields kept. */
type Option = {
  option?: string | null;
  /** First non-nullish of the child's recPrice, price, medPrice. */
  price?: number | null;
  /** First non-nullish of the child's quantity, quantityAvailable. */
  quantity?: number | null;
  kioskQuantity?: number | null;
};

/** Normalized product record written to the output JSON. */
type Product = {
  id: string;
  /** Taken from the raw `cName` field. */
  slug?: string;
  name: string;
  brand?: string;
  type?: string;
  /** Taken from the raw `subcategory` field. */
  category?: string;
  strainType?: string | null;
  status?: string | null;
  price?: number | null;
  specialPrice?: number | null;
  image?: string | null;
  inStock: boolean;
  options: Option[];
  /** Always left undefined by the normalizer in this file. */
  raw?: any;
};
|
||||
|
||||
/** Slug used in the embedded-menu URL. */
const DISPENSARY_SLUG = 'AZ-Deeply-Rooted';
/** Sent as `productsFilter.dispensaryId` in the GraphQL variables. */
const DISPENSARY_ID = '6405ef617056e8014d79101b';
/** Sent as the persisted query's `sha256Hash` for the FilteredProducts operation. */
const HASH_FILTERED_PRODUCTS = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';

// Output lives under <cwd>/scrape-output/deeply-rooted.
const OUTPUT_DIR = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted');
const OUTPUT_FILE = path.resolve(OUTPUT_DIR, 'graphql-products.json');
|
||||
|
||||
/** Create OUTPUT_DIR (and any missing parents); a no-op when it already exists. */
async function ensureOutputDir(): Promise<void> {
  const options = { recursive: true };
  await fs.mkdir(OUTPUT_DIR, options);
}
|
||||
|
||||
/**
 * Fetch the dispensary's product list through Dutchie's GraphQL endpoint.
 *
 * The embedded menu is opened in a headless browser and the persisted
 * `FilteredProducts` query is issued from inside the page, so the requests
 * carry the page's cookies and the session value from `localStorage`.
 *
 * Paging stops at the first short page, at the first non-OK response (the
 * products gathered so far are still returned), or after 40 pages.
 *
 * The browser is closed in a `finally`, so a navigation or evaluation failure
 * no longer leaves a Chromium process behind.
 *
 * @returns The normalized products.
 */
async function fetchAllProducts(): Promise<Product[]> {
  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const context = await browser.newContext({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.7390.37 Safari/537.36',
    });

    const page = await context.newPage();
    await page.goto(`https://dutchie.com/embedded-menu/${DISPENSARY_SLUG}`, {
      waitUntil: 'domcontentloaded',
      timeout: 90000,
    });
    await page.waitForTimeout(3000);

    // Runs inside the page: only the serialized argument object is available.
    const products: any[] = await page.evaluate(
      async ({ dispensaryId, hash }) => {
        // The stored value may be wrapped in double quotes; strip them.
        const sessionRaw = localStorage.getItem('dutchie-session');
        const session = sessionRaw ? sessionRaw.replace(/^"|"$/g, '') : '';

        const all: any[] = [];
        const perPage = 100;

        for (let pageIdx = 0; pageIdx < 40; pageIdx++) {
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId,
              pricingType: 'rec',
              Status: 'Active', // set to null to try to include inactive if exposed
              types: [],
              useCache: true,
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageIdx,
            perPage,
          };

          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify({
              persistedQuery: { version: 1, sha256Hash: hash },
            }),
          });

          const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;
          const res = await fetch(url, {
            headers: {
              'apollographql-client-name': 'Marketplace (production)',
              'x-dutchie-session': session,
              'content-type': 'application/json',
            },
            credentials: 'include',
          });

          if (!res.ok) {
            // This warning goes to the page's console, not the Node process.
            console.warn(`Request failed ${res.status} on page ${pageIdx}`);
            break;
          }

          const json = await res.json();
          const chunk = json?.data?.filteredProducts?.products || [];
          all.push(...chunk);

          // A short page means there is nothing further to fetch.
          if (chunk.length < perPage) break;
        }

        return all;
      },
      { dispensaryId: DISPENSARY_ID, hash: HASH_FILTERED_PRODUCTS }
    );

    return normalizeProducts(products);
  } finally {
    await browser.close();
  }
}
|
||||
|
||||
/**
 * Convert raw GraphQL product objects into the flat `Product` shape.
 *
 * - `options` come from `POSMetaData.children` (empty when absent).
 * - `price` is the first non-nullish of the leading entries of
 *   `recSpecialPrices`, `recPrices`, `Prices`.
 * - `inStock` is true when any option has a positive quantity or kiosk
 *   quantity, or when `isBelowThreshold` is falsy.
 * - `raw` is always set to `undefined`.
 */
function normalizeProducts(items: any[]): Product[] {
  // Same guard as writing `list && list[0]` inline: falsy containers pass through.
  const head = (list: any) => list && list[0];

  const toOption = (child: any): Option => ({
    option: child.option ?? null,
    price: child.recPrice ?? child.price ?? child.medPrice ?? null,
    quantity: child.quantity ?? child.quantityAvailable ?? null,
    kioskQuantity: child.kioskQuantityAvailable ?? null,
  });

  const hasStock = (o: Option): boolean => (o.quantity ?? 0) > 0 || (o.kioskQuantity ?? 0) > 0;

  return items.map((p) => {
    const options: Option[] = p?.POSMetaData?.children?.map((child: any) => toOption(child)) || [];

    const basePrice = head(p.recSpecialPrices) ?? head(p.recPrices) ?? head(p.Prices) ?? null;

    const activeImageUrl = p.images && p.images.find((img: any) => img.active)?.url;
    const image = p.Image || activeImageUrl || null;

    const inStock = options.some(hasStock) || !p.isBelowThreshold;

    const specialPrice = head(p.recSpecialPrices) || head(p.medicalSpecialPrices) || null;

    return {
      id: p.id || p._id,
      slug: p.cName,
      name: p.Name,
      brand: p.brandName || p.brand?.name,
      type: p.type,
      category: p.subcategory,
      strainType: p.strainType,
      status: p.Status,
      price: basePrice,
      specialPrice,
      image,
      inStock,
      options,
      raw: undefined,
    };
  });
}
|
||||
|
||||
/**
 * Compute headline numbers for a product list.
 *
 * @returns Total count, in-stock and out-of-stock counts, and the ten most
 *          frequent brands as `[brand, count]` pairs (most frequent first).
 *          Products without a brand are counted under 'Unknown'.
 */
function summarize(products: Product[]) {
  const brandCounts = new Map<string, number>();
  let inStock = 0;

  products.forEach((product) => {
    if (product.inStock) inStock += 1;
    const brandKey = (product.brand || 'Unknown').trim();
    brandCounts.set(brandKey, (brandCounts.get(brandKey) || 0) + 1);
  });

  const total = products.length;
  const ranked = [...brandCounts.entries()];
  ranked.sort(([, countA], [, countB]) => countB - countA);

  return { total, inStock, outOfStock: total - inStock, topBrands: ranked.slice(0, 10) };
}
|
||||
|
||||
/**
 * Take the first `n` products and keep only the fields worth printing.
 *
 * @param n Number of leading products to include (default 5).
 */
function formatSample(products: Product[], n = 5) {
  const leading = products.slice(0, n);
  return leading.map(({ name, brand, price, specialPrice, inStock, options }) => {
    return { name, brand, price, specialPrice, inStock, options };
  });
}
|
||||
|
||||
/**
 * Entry point: fetch every product, write them to OUTPUT_FILE and print a
 * short summary plus a five-product sample.
 */
async function main() {
  await ensureOutputDir();

  const products = await fetchAllProducts();
  const serialized = JSON.stringify(products, null, 2);
  await fs.writeFile(OUTPUT_FILE, serialized);

  const { inStock, outOfStock, topBrands } = summarize(products);
  console.log(`Saved ${products.length} products to ${OUTPUT_FILE}`);
  console.log(`In stock: ${inStock} | Out of stock: ${outOfStock}`);
  console.log('Top brands:', topBrands);
  console.log('Sample:', JSON.stringify(formatSample(products, 5), null, 2));
}

// Any failure is logged and terminates the process with exit status 1.
main().catch((err) => {
  console.error('GraphQL scrape failed:', err);
  process.exit(1);
});
|
||||
90
backend/new-scrapers/postprocess-deeply-rooted-clean.js
Normal file
90
backend/new-scrapers/postprocess-deeply-rooted-clean.js
Normal file
@@ -0,0 +1,90 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Input: grouped inventory JSON; output: the cleaned version in the same folder.
const INPUT = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted', 'inventory-by-brand.json');
const OUTPUT = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted', 'cleaned-inventory.json');
|
||||
|
||||
/**
 * Return the lowest "$"-prefixed amount found in `text`, or `fallback` when
 * the text contains none.
 */
function extractPrice(text, fallback) {
  let lowest;
  for (const match of text.matchAll(/\$([0-9]+(?:\.[0-9]{2})?)/g)) {
    const amount = parseFloat(match[1]);
    if (lowest === undefined || amount < lowest) {
      lowest = amount;
    }
  }
  return lowest === undefined ? fallback : lowest;
}
|
||||
|
||||
/**
 * Split a scraped name on the ellipsis character into the product name (first
 * piece) and an inferred brand (second piece, restricted to letters, digits,
 * whitespace, '-', '&' and apostrophes). An explicit `rawBrand` takes priority
 * over the inferred one; 'Unknown' is used when neither is available. The
 * chosen brand is passed through `normalizeBrand`.
 */
function cleanBrandAndName(rawName, rawBrand) {
  const [firstPiece, secondPiece] = rawName
    .split('…')
    .map((piece) => piece.trim())
    .filter(Boolean);

  const name = firstPiece || rawName.trim();

  const inferredBrand = secondPiece
    ?.replace(/[^a-z0-9\s\-\&']/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim();

  const chosenBrand = rawBrand || inferredBrand || 'Unknown';
  return { name, brand: normalizeBrand(chosenBrand.trim()) };
}
|
||||
|
||||
/**
 * Produce the cleaned record for one raw product: cleaned name and brand, the
 * price found in the name (falling back to `p.price`), and the remaining
 * fields copied over. `inStock` is false only when the raw value is exactly
 * `false`.
 */
function cleanProduct(p) {
  const cleaned = cleanBrandAndName(p.name, p.brand);
  return {
    name: cleaned.name,
    brand: cleaned.brand || 'Unknown',
    price: extractPrice(p.name, p.price),
    size: p.size,
    category: p.category,
    url: p.url,
    imageUrl: p.imageUrl,
    inStock: p.inStock !== false,
  };
}
|
||||
|
||||
/**
 * Drop duplicate products, keeping the first occurrence of each key.
 * The key is the product URL when present, otherwise "name|brand" lower-cased.
 */
function dedupe(products) {
  const firstByKey = new Map();
  products.forEach((product) => {
    const rawKey = product.url
      ? product.url
      : `${product.name.toLowerCase()}|${product.brand.toLowerCase()}`;
    const key = rawKey.trim();
    if (firstByKey.has(key)) return;
    firstByKey.set(key, product);
  });
  return [...firstByKey.values()];
}
|
||||
|
||||
/**
 * Bucket products by brand ('Unknown' when the brand is empty), preserving
 * first-seen brand order and product order within each brand.
 */
function groupByBrand(products) {
  const buckets = new Map();
  for (const product of products) {
    const brandKey = product.brand || 'Unknown';
    const bucket = buckets.get(brandKey);
    if (bucket) {
      bucket.push(product);
    } else {
      buckets.set(brandKey, [product]);
    }
  }
  return Array.from(buckets, ([brand, prods]) => ({ brand, products: prods }));
}
|
||||
|
||||
/**
 * Map known mangled brand spellings to their canonical form; any other brand
 * is returned unchanged.
 *
 * A Map is used rather than a plain object so that brand strings which happen
 * to match inherited object members ('constructor', 'toString', '__proto__')
 * are returned as-is instead of resolving to that member.
 */
function normalizeBrand(brand) {
  const replacements = new Map([
    ['Gr n', 'Gron'],
  ]);
  return replacements.get(brand) || brand;
}
|
||||
|
||||
/**
 * Entry point: read the grouped inventory from INPUT, clean and de-duplicate
 * every product, regroup by brand, write the result to OUTPUT and print a
 * short summary.
 */
function main() {
  const groups = JSON.parse(fs.readFileSync(INPUT, 'utf8'));

  // Flatten all brand groups into one list of cleaned products.
  const flattened = groups.flatMap((group) => group.products.map((p) => cleanProduct(p)));

  const unique = dedupe(flattened);
  const grouped = groupByBrand(unique);

  fs.writeFileSync(OUTPUT, JSON.stringify(grouped, null, 2));

  const outOfStock = unique.reduce((count, p) => (p.inStock ? count : count + 1), 0);

  console.log(`Cleaned products: ${unique.length}`);
  console.log(`Out of stock: ${outOfStock}`);
  console.log(`Brands: ${grouped.length}`);
  console.log(`Saved to ${OUTPUT}`);
}

main();
|
||||
114
backend/new-scrapers/postprocess-deeply-rooted-clean.ts
Normal file
114
backend/new-scrapers/postprocess-deeply-rooted-clean.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
/** A product as it appears in the scraped inventory JSON. */
type RawProduct = {
  /** Scraped name; may also carry brand and price text. */
  name: string;
  brand?: string;
  price?: number;
  size?: string;
  category?: string;
  url?: string;
  imageUrl?: string;
  inStock?: boolean;
};

/** A product after cleaning: brand and stock status are always filled in. */
type CleanProduct = {
  name: string;
  /** 'Unknown' when no brand could be determined. */
  brand: string;
  price?: number;
  size?: string;
  category?: string;
  url?: string;
  imageUrl?: string;
  inStock: boolean;
};

/** All cleaned products that share one brand. */
type BrandGroup = {
  brand: string;
  products: CleanProduct[];
};
|
||||
|
||||
// Input: grouped inventory JSON; output: the cleaned version in the same folder.
const INPUT = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted', 'inventory-by-brand.json');
const OUTPUT = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted', 'cleaned-inventory.json');
|
||||
|
||||
/**
 * Return the lowest "$"-prefixed amount found in `text` (usually the sale
 * price), or `fallback` when the text contains no such amount.
 */
function extractPrice(text: string, fallback?: number): number | undefined {
  const pricePattern = /\$([0-9]+(?:\.[0-9]{2})?)/g;
  const amounts = Array.from(text.matchAll(pricePattern), (m) => parseFloat(m[1]));
  if (amounts.length === 0) {
    return fallback;
  }
  return amounts.reduce((lowest, amount) => Math.min(lowest, amount));
}
|
||||
|
||||
/**
 * Map known mangled brand spellings to their canonical form; any other brand
 * is returned unchanged.
 */
function normalizeBrand(brand: string): string {
  const replacements = new Map<string, string>([['Gr n', 'Gron']]);
  return replacements.get(brand) ?? brand;
}

/**
 * Split a scraped name on the ellipsis character into the product name (first
 * piece) and an inferred brand (second piece).
 *
 * The inferred brand is reduced to letters, digits, whitespace, '-', '&' and
 * apostrophes. Accented letters are folded to their base letter first
 * ("Grön" becomes "Gron"); without that step the ASCII filter would replace
 * the accented letter with a space. The chosen brand then goes through
 * `normalizeBrand`, which also repairs the already-mangled spelling "Gr n".
 *
 * @param rawName  Scraped product name, possibly "name…brand…".
 * @param rawBrand Explicit brand; takes priority over the inferred one.
 * @returns The cleaned name and brand ('Unknown' when no brand is available).
 */
function cleanBrandAndName(rawName: string, rawBrand?: string): { name: string; brand: string } {
  const parts = rawName.split('…').map((p) => p.trim()).filter(Boolean);
  const name = parts[0] || rawName.trim();
  const inferredBrand = parts[1]
    ?.normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '') // strip the combining marks left by NFD
    .replace(/[^a-z0-9\s\-&']/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  const brand = normalizeBrand((rawBrand || inferredBrand || 'Unknown').trim());
  return { name, brand };
}
|
||||
|
||||
/**
 * Produce the cleaned record for one raw product: cleaned name and brand, the
 * price found in the name (falling back to `p.price`), and the remaining
 * fields copied over. `inStock` is false only when the raw value is exactly
 * `false`.
 */
function cleanProduct(p: RawProduct): CleanProduct {
  const cleaned = cleanBrandAndName(p.name, p.brand);
  return {
    name: cleaned.name,
    brand: cleaned.brand || 'Unknown',
    price: extractPrice(p.name, p.price),
    size: p.size,
    category: p.category,
    url: p.url,
    imageUrl: p.imageUrl,
    inStock: p.inStock !== false,
  };
}
|
||||
|
||||
/**
 * Drop duplicate products, keeping the first occurrence of each key.
 * The key is the product URL when present, otherwise "name|brand" lower-cased.
 */
function dedupe(products: CleanProduct[]): CleanProduct[] {
  const firstByKey = new Map<string, CleanProduct>();
  products.forEach((product) => {
    const rawKey = product.url
      ? product.url
      : `${product.name.toLowerCase()}|${product.brand.toLowerCase()}`;
    const key = rawKey.trim();
    if (firstByKey.has(key)) return;
    firstByKey.set(key, product);
  });
  return [...firstByKey.values()];
}
|
||||
|
||||
/**
 * Bucket products by brand ('Unknown' when the brand is empty), preserving
 * first-seen brand order and product order within each brand.
 */
function groupByBrand(products: CleanProduct[]): BrandGroup[] {
  const buckets = new Map<string, CleanProduct[]>();
  for (const product of products) {
    const brandKey = product.brand || 'Unknown';
    const bucket = buckets.get(brandKey);
    if (bucket) {
      bucket.push(product);
    } else {
      buckets.set(brandKey, [product]);
    }
  }
  return Array.from(buckets, ([brand, prods]) => ({ brand, products: prods }));
}
|
||||
|
||||
/**
 * Entry point: read the grouped inventory from INPUT, clean and de-duplicate
 * every product, regroup by brand, write the result to OUTPUT and print a
 * short summary.
 */
async function main() {
  const fileText = await fs.readFile(INPUT, 'utf8');
  const raw = JSON.parse(fileText) as { brand: string; products: RawProduct[] }[];

  // Flatten all brand groups into one list of cleaned products.
  const flattened: CleanProduct[] = raw.flatMap((group) => group.products.map((p) => cleanProduct(p)));

  const unique = dedupe(flattened);
  const grouped = groupByBrand(unique);

  await fs.writeFile(OUTPUT, JSON.stringify(grouped, null, 2));

  const outOfStock = unique.reduce((count, p) => (p.inStock ? count : count + 1), 0);

  console.log(`Cleaned products: ${unique.length}`);
  console.log(`Out of stock: ${outOfStock}`);
  console.log(`Brands: ${grouped.length}`);
  console.log(`Saved to ${OUTPUT}`);
}

// Failures are logged and reported through the process exit code.
main().catch((err) => {
  console.error('Post-process failed:', err);
  process.exitCode = 1;
});
|
||||
183
backend/new-scrapers/scrape-deeply-rooted-inventory-by-brand.ts
Normal file
183
backend/new-scrapers/scrape-deeply-rooted-inventory-by-brand.ts
Normal file
@@ -0,0 +1,183 @@
|
||||
import { chromium as playwright } from 'playwright-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
playwright.use(StealthPlugin());
|
||||
|
||||
/** One product card scraped from the menu listing. */
type Product = {
  name: string;
  brand?: string;
  price?: number;
  size?: string;
  category?: string;
  /** Link to the product page, when the card contains one. */
  url?: string;
  imageUrl?: string;
  /** False when the card text mentions "sold out" or "out of stock". */
  inStock: boolean;
};

/** All scraped products that share one brand. */
type BrandGroup = {
  brand: string;
  products: Product[];
};
|
||||
|
||||
/** Embedded-menu page whose product cards are scraped. */
const TARGET_URL = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';

// Output lives under <cwd>/scrape-output/deeply-rooted.
const OUTPUT_DIR = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted');
const JSON_PATH = path.resolve(OUTPUT_DIR, 'inventory-by-brand.json');
|
||||
|
||||
/** Create OUTPUT_DIR (and any missing parents); a no-op when it already exists. */
async function ensureDirs(): Promise<void> {
  const options = { recursive: true };
  await fs.mkdir(OUTPUT_DIR, options);
}
|
||||
|
||||
/**
 * Poll the page until it no longer looks like a Cloudflare challenge.
 *
 * @param page      Object exposing `title()`, `content()` and `waitForTimeout()`.
 * @param maxWaitMs Give up once this many milliseconds have elapsed.
 * @returns `true` as soon as no challenge marker is seen, `false` on timeout.
 */
async function waitForCloudflare(page: any, maxWaitMs = 60000): Promise<boolean> {
  const looksLikeChallenge = (title: string, html: string): boolean => {
    if (title.includes('Attention Required')) return true;
    if (title.includes('Just a moment')) return true;
    if (html.includes('challenge-platform')) return true;
    return html.includes('cf-challenge');
  };

  const startedAt = Date.now();
  let elapsed = 0;
  while (elapsed < maxWaitMs) {
    // A failed read (e.g. mid-navigation) is treated as empty text.
    const title: string = await page.title().catch(() => '');
    const html: string = await page.content().catch(() => '');
    if (!looksLikeChallenge(title, html)) {
      return true;
    }
    await page.waitForTimeout(2000);
    elapsed = Date.now() - startedAt;
  }
  return false;
}
|
||||
|
||||
/**
 * Scroll the page with the mouse wheel until the number of product/card
 * elements stops growing (at most 40 rounds), then scroll back to the top.
 */
async function loadAllProducts(page: any): Promise<void> {
  const cardSelector = '[data-testid*="product"], [data-testid*="card"]';
  const countCards = (): Promise<number> =>
    page.$$eval(cardSelector, (els: unknown[]) => els.length);

  let roundsLeft = 40;
  while (roundsLeft-- > 0) {
    const before = await countCards();
    await page.mouse.wheel(0, 1400);
    await page.waitForTimeout(900);
    const after = await countCards();
    // No new cards after a scroll: assume the list is fully loaded.
    if (after <= before) break;
  }

  await page.evaluate(() => window.scrollTo({ top: 0 }));
}
|
||||
|
||||
/**
 * Scrape every product card currently present in the page.
 *
 * The extraction logic is shipped to the page as a source string and
 * evaluated there, so it is plain JavaScript and uses only browser globals.
 *
 * Price parsing prefers a "$"-prefixed amount. When a card has no dedicated
 * price element the whole card text is searched, and a leading number such as
 * a weight or dosage ("100mg") must not be mistaken for the price.
 *
 * @param page Object exposing `evaluate()` (a Playwright page in this script).
 * @returns One entry per card that yielded a non-empty name.
 */
async function extractProducts(page: any): Promise<Product[]> {
  const script = `
    (() => {
      function parsePrice(text) {
        if (!text) return undefined;
        // Prefer an amount with a dollar sign; fall back to the first bare number.
        const match =
          text.match(/\\$\\s*(\\d+(?:\\.\\d{1,2})?)/) ||
          text.match(/(\\d+(?:\\.\\d{1,2})?)/);
        return match ? parseFloat(match[1]) : undefined;
      }

      function pickImage(card) {
        const imgEl =
          card.querySelector('img[src^="http"]') ||
          card.querySelector('source[srcset]');
        if (imgEl && imgEl.src && imgEl.src.startsWith('http')) {
          return imgEl.src;
        }
        if (imgEl && imgEl.srcset) {
          const first = imgEl.srcset.split(',')[0]?.trim().split(' ')[0];
          if (first && first.startsWith('http')) return first;
        }
        const dataSrc = card.querySelector('img[data-src]')?.getAttribute('data-src');
        if (dataSrc && dataSrc.startsWith('http')) return dataSrc;
        return undefined;
      }

      const cards = Array.from(
        document.querySelectorAll('[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]')
      );

      return cards
        .map((card) => {
          const name =
            card.querySelector('[data-testid="product-card-name"]')?.innerText?.trim() ||
            card.querySelector('[data-testid="product-name"]')?.innerText?.trim() ||
            card.querySelector('h3, h4')?.innerText?.trim() ||
            (card.textContent || '').split('\\n').map((t) => t.trim()).find((t) => t.length > 3) ||
            '';

          const brand =
            card.querySelector('[data-testid="product-card-brand"]')?.innerText?.trim() ||
            card.querySelector('[data-testid="product-brand"]')?.innerText?.trim() ||
            undefined;

          const priceText =
            card.querySelector('[data-testid="product-card-price"]')?.innerText ||
            card.textContent ||
            '';
          const price = parsePrice(priceText);

          const size =
            card.querySelector('[data-testid*="size"]')?.innerText?.trim() ||
            card.querySelector('[data-testid*="weight"]')?.innerText?.trim() ||
            undefined;

          const category =
            card.querySelector('[data-testid*="category"]')?.innerText?.trim() ||
            undefined;

          const link = card.querySelector('a[href*="/product/"]');
          const url = link?.href;

          const imageUrl = pickImage(card);

          const cardText = (card.textContent || '').toLowerCase();
          const inStock = !(cardText.includes('sold out') || cardText.includes('out of stock'));

          return { name, brand, price, size, category, url, imageUrl, inStock };
        })
        .filter((p) => p.name);
    })();
  `;

  return page.evaluate(script);
}
|
||||
|
||||
/**
 * Bucket products by brand ('Unknown' when the brand is missing), preserving
 * first-seen brand order and product order within each brand.
 */
function groupByBrand(products: Product[]): BrandGroup[] {
  const buckets = products.reduce((acc, product) => {
    const brandKey = product.brand || 'Unknown';
    const bucket = acc.get(brandKey);
    if (bucket === undefined) {
      acc.set(brandKey, [product]);
    } else {
      bucket.push(product);
    }
    return acc;
  }, new Map<string, Product[]>());

  const groups: BrandGroup[] = [];
  buckets.forEach((prods, brand) => groups.push({ brand, products: prods }));
  return groups;
}
|
||||
|
||||
/**
 * Entry point: open the embedded menu, wait out any Cloudflare challenge,
 * scroll until all cards are loaded, extract them and write the products
 * grouped by brand to JSON_PATH.
 *
 * Scrape failures are logged and reported through `process.exitCode = 1`.
 * The page is created inside the `try`, so the browser is closed even when
 * page creation fails.
 */
async function main() {
  await ensureDirs();

  const browser = await playwright.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
  });

  try {
    const page = await browser.newPage({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    console.log(`Navigating to ${TARGET_URL}...`);
    await page.goto(TARGET_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });

    const cfOk = await waitForCloudflare(page, 60000);
    if (!cfOk) throw new Error('Cloudflare challenge not passed in time');

    // Best effort: extraction is still attempted if no card ever appears.
    await page.waitForSelector('[data-testid*="product"], [data-testid*="card"]', { timeout: 60000 }).catch(() => undefined);

    await loadAllProducts(page);
    const products = await extractProducts(page);
    const grouped = groupByBrand(products);

    await fs.writeFile(JSON_PATH, JSON.stringify(grouped, null, 2));
    console.log(`Found ${products.length} products across ${grouped.length} brands`);
    console.log(`Saved grouped inventory to ${JSON_PATH}`);
  } catch (err) {
    console.error('Inventory scrape failed:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Directory creation and browser launch happen before the try block above;
// catch their rejections here instead of leaving them unhandled.
main().catch((err) => {
  console.error('Inventory scrape failed:', err);
  process.exitCode = 1;
});
|
||||
115
backend/new-scrapers/scrape-deeply-rooted-playwright.ts
Normal file
115
backend/new-scrapers/scrape-deeply-rooted-playwright.ts
Normal file
@@ -0,0 +1,115 @@
|
||||
import { chromium, Frame } from 'playwright';
|
||||
|
||||
/** One product card scraped from the embedded Dutchie menu. */
type Product = {
  name: string;
  brand?: string;
  price?: number;
  size?: string;
  category?: string;
  /** Link to the product page, when the card contains one. */
  url?: string;
};
|
||||
|
||||
/**
 * Locate the embedded Dutchie iframe on the host page and return its frame
 * once the frame's DOM content has loaded.
 *
 * @throws Error when the matched element exposes no content frame.
 */
async function getDutchieFrame(page: any): Promise<Frame> {
  const iframeSelector = 'iframe[src*="dutchie"], iframe[srcdoc*="dutchie"], iframe[id^="iframe-"]';
  const handle = await page.waitForSelector(iframeSelector, { timeout: 45000 });

  const embedded = await handle.contentFrame();
  if (embedded) {
    await embedded.waitForLoadState('domcontentloaded', { timeout: 30000 });
    return embedded;
  }

  throw new Error('Unable to access embedded Dutchie iframe.');
}
|
||||
|
||||
/**
 * Scroll with the mouse wheel until the number of product/card elements in
 * the frame stops growing (at most 20 rounds), then scroll the frame back to
 * the top.
 *
 * Wheel events are sent through the frame's owning page: a Playwright `Frame`
 * has no `mouse` of its own, so calling `frame.mouse.wheel` throws.
 */
async function loadAllProducts(frame: Frame): Promise<void> {
  const cardSelector = '[data-testid*="product"], [data-testid*="card"]';
  const mouse = frame.page().mouse;
  const maxScrolls = 20;

  for (let i = 0; i < maxScrolls; i++) {
    const beforeCount = await frame.$$eval(cardSelector, (els) => els.length);
    await mouse.wheel(0, 1200);
    await frame.waitForTimeout(800);
    const afterCount = await frame.$$eval(cardSelector, (els) => els.length);
    // No new cards after a scroll: assume the list is fully loaded.
    if (afterCount <= beforeCount) break;
  }

  await frame.evaluate(() => window.scrollTo({ top: 0 }));
}
|
||||
|
||||
/**
 * Scrape every product card currently present in the frame.
 *
 * The callback runs inside the frame, so it is self-contained and uses only
 * browser globals.
 *
 * Price parsing prefers a "$"-prefixed amount. When a card has no dedicated
 * price element the whole card text is searched, and a leading number such as
 * the weight in "Live Resin 1g $32.00" must not be mistaken for the price.
 *
 * @returns One entry per card that yielded a non-empty name.
 */
async function extractProducts(frame: Frame): Promise<Product[]> {
  return frame.evaluate(() => {
    /** Trimmed innerText of the first descendant matching `selector`, if any. */
    const textOf = (card: Element, selector: string): string | undefined =>
      (card.querySelector(selector) as HTMLElement | null)?.innerText?.trim();

    const cards = Array.from(
      document.querySelectorAll('[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]')
    );

    return cards
      .map((card: Element) => {
        const name =
          textOf(card, '[data-testid="product-card-name"]') ||
          textOf(card, '[data-testid="product-name"]') ||
          textOf(card, 'h3, h4') ||
          (card.textContent || '').split('\n').map((t) => t.trim()).find((t) => t.length > 3) ||
          '';

        const brand =
          textOf(card, '[data-testid="product-card-brand"]') ||
          textOf(card, '[data-testid="product-brand"]') ||
          undefined;

        const priceText =
          (card.querySelector('[data-testid="product-card-price"]') as HTMLElement | null)?.innerText ||
          (card.textContent || '');
        // Prefer an amount with a dollar sign; fall back to the first bare number.
        const priceMatch =
          priceText.match(/\$\s*(\d+(?:\.\d{2})?)/) ?? priceText.match(/(\d+(?:\.\d{2})?)/);
        const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;

        const size =
          textOf(card, '[data-testid*="size"]') ||
          textOf(card, '[data-testid*="weight"]') ||
          undefined;

        const category = textOf(card, '[data-testid*="category"]') || undefined;

        const link = card.querySelector('a[href*="/product/"]') as HTMLAnchorElement | null;
        const url = link?.href;

        return { name, brand, price, size, category, url };
      })
      .filter((p) => p.name);
  });
}
|
||||
|
||||
/**
 * Entry point: opens the Deeply Rooted menu page, waits for the embedded
 * Dutchie iframe, loads and extracts the product cards, then prints the total
 * count and the first 20 records as JSON.
 *
 * Scrape failures are logged and reported through a non-zero
 * `process.exitCode`; the browser is closed in every case once it has launched.
 */
async function main() {
  const targetUrl = 'https://azdeeplyrooted.com/menu';

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
  });

  try {
    // Created inside the try so that a failure here still reaches
    // `browser.close()` in the finally block.
    const page = await browser.newPage({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    console.log(`Navigating to ${targetUrl}...`);
    await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    const frame = await getDutchieFrame(page);
    // Best effort: a timeout here is ignored and extraction proceeds with
    // whatever is rendered.
    await frame.waitForSelector('[data-testid*="product"], [data-testid*="card"]', { timeout: 60000 }).catch(() => undefined);

    await loadAllProducts(frame);
    const products = await extractProducts(frame);

    console.log(`Found ${products.length} products`);
    console.log(JSON.stringify(products.slice(0, 20), null, 2));
  } catch (err) {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// A browser launch failure happens before main's own try/catch, so it is
// handled here instead of surfacing as an unhandled promise rejection.
main().catch(err => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});
|
||||
191
backend/new-scrapers/scrape-deeply-rooted-with-images.ts
Normal file
191
backend/new-scrapers/scrape-deeply-rooted-with-images.ts
Normal file
@@ -0,0 +1,191 @@
|
||||
import { chromium, Frame } from 'playwright';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
/** One product record as scraped from a menu card. Only `name` is required. */
interface Product {
  /** Display name of the product. */
  name: string;
  /** Brand label, when the card exposes one. */
  brand?: string;
  /** Numeric price parsed from the card text. */
  price?: number;
  /** Size or weight label. */
  size?: string;
  /** Category label. */
  category?: string;
  /** Link to the product detail page. */
  url?: string;
  /** Absolute URL of the product image. */
  imageUrl?: string;
}
|
||||
|
||||
/** A scraped product plus the local path its image was saved to, if any. */
interface ProductWithImagePath extends Product {
  imagePath?: string;
}
|
||||
|
||||
// Page that is opened and searched for the embedded Dutchie iframe.
const TARGET_URL = 'https://azdeeplyrooted.com/menu';

// All scrape output lives beneath <cwd>/scrape-output/deeply-rooted.
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'deeply-rooted');
// Directory for downloaded product images.
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');
// File that receives the extracted product records as JSON.
const JSON_PATH = path.join(OUTPUT_DIR, 'products.json');
|
||||
|
||||
/** Creates the image output directory, including any missing parent directories. */
async function ensureDirs(): Promise<void> {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(IMAGE_DIR, mkdirOptions);
}
|
||||
|
||||
/**
 * Finds the embedded Dutchie menu iframe on the host page and returns its frame.
 *
 * @param page - Page object exposing `waitForSelector` (presumably a Playwright
 *   Page that has already navigated to the menu URL — typed `any` here).
 * @returns The iframe's content frame, after it has reached `domcontentloaded`.
 * @throws Error when the matched iframe element has no accessible content frame.
 */
async function getDutchieFrame(page: any): Promise<Frame> {
  // Matches an iframe by Dutchie src/srcdoc, or by the "iframe-" id prefix.
  const iframeSelector =
    'iframe[src*="dutchie"], iframe[srcdoc*="dutchie"], iframe[id^="iframe-"]';
  const waitOptions = { timeout: 45000 };

  const element = await page.waitForSelector(iframeSelector, waitOptions);
  const dutchieFrame = await element.contentFrame();

  if (dutchieFrame) {
    await dutchieFrame.waitForLoadState('domcontentloaded', { timeout: 30000 });
    return dutchieFrame;
  }

  throw new Error('Unable to access embedded Dutchie iframe.');
}
|
||||
|
||||
/**
 * Scrolls the menu so lazily rendered product cards get a chance to load.
 *
 * Each round counts the card-like elements in the frame, sends a wheel scroll,
 * waits briefly, and counts again. Scrolling stops as soon as a round adds no
 * new cards, or after `maxScrolls` rounds. Finally the frame is scrolled back
 * to the top.
 *
 * @param frame - The Dutchie iframe's frame.
 */
async function loadAllProducts(frame: Frame): Promise<void> {
  const cardSelector = '[data-testid*="product"], [data-testid*="card"]';
  const maxScrolls = 30;
  const countCards = (): Promise<number> => frame.$$eval(cardSelector, els => els.length);

  for (let i = 0; i < maxScrolls; i++) {
    const beforeCount = await countCards();
    // A Frame has no `mouse` property; the Mouse belongs to the owning Page.
    // The previous `frame.mouse.wheel(...)` threw a TypeError on the first round.
    await frame.page().mouse.wheel(0, 1200);
    await frame.waitForTimeout(900);
    const afterCount = await countCards();
    if (afterCount <= beforeCount) break;
  }

  await frame.evaluate(() => window.scrollTo({ top: 0 }));
}
|
||||
|
||||
/**
 * Reads every product card currently rendered in the frame and converts it to
 * a plain `Product` record, including an image URL when one can be found.
 *
 * @param frame - The Dutchie iframe's frame.
 * @returns One record per card that yielded a non-empty name.
 */
async function extractProducts(frame: Frame): Promise<Product[]> {
  return frame.evaluate(() => {
    // Everything used below is declared inside this callback, which is handed
    // to `frame.evaluate` as a unit.
    const trimmedText = (root: Element, selector: string): string | undefined =>
      (root.querySelector(selector) as HTMLElement)?.innerText?.trim();

    // Image lookup order: an <img> with an http(s) src, else a <source srcset>
    // (first candidate URL), else an <img data-src> attribute.
    const pickImage = (card: Element): string | undefined => {
      const media: any =
        card.querySelector('img[src^="http"]') || card.querySelector('source[srcset]');

      if (media) {
        if (typeof media.src === 'string' && media.src.startsWith('http')) {
          return media.src;
        }
        if (typeof media.srcset === 'string') {
          const firstCandidate = media.srcset.split(',')[0]?.trim().split(' ')[0];
          if (firstCandidate?.startsWith('http')) return firstCandidate;
        }
      }

      const lazySrc = (card.querySelector('img[data-src]') as HTMLImageElement | null)?.getAttribute('data-src');
      if (lazySrc?.startsWith('http')) return lazySrc;
      return undefined;
    };

    const cardNodes = Array.from(
      document.querySelectorAll('[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]')
    );

    const records = cardNodes.map((card: Element) => {
      const rawText = card.textContent || '';

      // Name: dedicated test ids first, then a heading, then the first text
      // line longer than three characters.
      const name =
        trimmedText(card, '[data-testid="product-card-name"]') ||
        trimmedText(card, '[data-testid="product-name"]') ||
        trimmedText(card, 'h3, h4') ||
        rawText.split('\n').map(t => t.trim()).find(t => t.length > 3) ||
        '';

      const brand =
        trimmedText(card, '[data-testid="product-card-brand"]') ||
        trimmedText(card, '[data-testid="product-brand"]') ||
        undefined;

      // Price: first number found in the price element, or in the whole card
      // text when the price element is missing or empty.
      const priceSource =
        (card.querySelector('[data-testid="product-card-price"]') as HTMLElement)?.innerText ||
        rawText;
      const priceHit = priceSource.match(/\$?(\d+(?:\.\d{2})?)/);
      let price: number | undefined;
      if (priceHit) {
        price = parseFloat(priceHit[1]);
      }

      const size =
        trimmedText(card, '[data-testid*="size"]') ||
        trimmedText(card, '[data-testid*="weight"]') ||
        undefined;

      const category = trimmedText(card, '[data-testid*="category"]') || undefined;

      const anchor = card.querySelector('a[href*="/product/"]') as HTMLAnchorElement | null;
      const url = anchor?.href;

      const imageUrl = pickImage(card);

      return { name, brand, price, size, category, url, imageUrl };
    });

    // Cards without any usable name are dropped.
    return records.filter(p => p.name);
  });
}
|
||||
|
||||
/**
 * Builds a file name from arbitrary text.
 *
 * The base is lower-cased, every run of characters outside `a-z0-9` becomes a
 * single dash, and one leading and one trailing dash are stripped. If nothing
 * is left, the stem falls back to `product`.
 *
 * @param base - Free-form text, e.g. a product name.
 * @param ext - Extension without the leading dot; appended unchanged.
 * @returns `<stem>.<ext>`
 */
function safeFileName(base: string, ext: string): string {
  const slug = base
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
  const stem = slug === '' ? 'product' : slug;
  return [stem, ext].join('.');
}
|
||||
|
||||
/**
 * Downloads each product's image into `IMAGE_DIR`, one product at a time.
 *
 * Products without an `imageUrl`, and products whose download fails, are passed
 * through unchanged (a failure is logged as a warning). Successful downloads
 * get an `imagePath` pointing at the saved file.
 *
 * File names are derived from the product name. When two products would map to
 * the same name, later ones get a numeric suffix (`-2`, `-3`, …) so an earlier
 * image is never overwritten within one run.
 *
 * @param products - Records produced by the extraction step.
 * @returns The same records, in the same order, with `imagePath` set where a
 *   file was written.
 */
async function downloadImages(products: Product[]): Promise<ProductWithImagePath[]> {
  const results: ProductWithImagePath[] = [];
  // File names already written during this run, used to avoid collisions.
  const usedNames = new Set<string>();

  for (const product of products) {
    if (!product.imageUrl) {
      results.push(product);
      continue;
    }

    try {
      const res = await fetch(product.imageUrl);
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const arrayBuffer = await res.arrayBuffer();

      const contentType = res.headers.get('content-type') || '';
      const extFromType =
        contentType.includes('png') ? 'png' :
        contentType.includes('jpeg') ? 'jpg' :
        contentType.includes('jpg') ? 'jpg' :
        contentType.includes('webp') ? 'webp' :
        contentType.includes('gif') ? 'gif' : 'bin';

      // Prefer the extension in the URL path; fall back to the content type
      // (which itself ends in 'bin' when unrecognised).
      const urlExt = path.extname(new URL(product.imageUrl).pathname).replace('.', '');
      const ext = urlExt || extFromType;

      const stem = product.name || 'product';
      let fileName = safeFileName(stem, ext);
      for (let n = 2; usedNames.has(fileName); n++) {
        fileName = safeFileName(`${stem}-${n}`, ext);
      }
      usedNames.add(fileName);

      const filePath = path.join(IMAGE_DIR, fileName);
      await fs.writeFile(filePath, Buffer.from(arrayBuffer));
      results.push({ ...product, imagePath: filePath });
    } catch (err) {
      console.warn(`Failed to download image for ${product.name}: ${err}`);
      results.push(product);
    }
  }

  return results;
}
|
||||
|
||||
/**
 * Entry point: prepares the output directories, opens the Deeply Rooted menu
 * page, extracts every product card from the embedded Dutchie iframe,
 * downloads the product images, and writes all records to `JSON_PATH`.
 *
 * Scrape failures are logged and reported through a non-zero
 * `process.exitCode`; the browser is closed in every case once it has launched.
 */
async function main() {
  await ensureDirs();

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
  });

  try {
    // Created inside the try so that a failure here still reaches
    // `browser.close()` in the finally block.
    const page = await browser.newPage({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    console.log(`Navigating to ${TARGET_URL}...`);
    await page.goto(TARGET_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });

    const frame = await getDutchieFrame(page);
    // Best effort: a timeout here is ignored and extraction proceeds with
    // whatever is rendered.
    await frame.waitForSelector('[data-testid*="product"], [data-testid*="card"]', { timeout: 60000 }).catch(() => undefined);

    await loadAllProducts(frame);
    const products = await extractProducts(frame);
    console.log(`Found ${products.length} products, downloading images...`);

    const withImages = await downloadImages(products);
    await fs.writeFile(JSON_PATH, JSON.stringify(withImages, null, 2));

    console.log(`Saved data to ${JSON_PATH}`);
    console.log(`Images stored in ${IMAGE_DIR}`);
  } catch (err) {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Directory creation and browser launch happen before main's own try/catch,
// so their failures are handled here instead of surfacing as an unhandled
// promise rejection.
main().catch(err => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user