Files
cannaiq/backend/new-scrapers/graphql-deeply-rooted-products.ts
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

228 lines
6.3 KiB
TypeScript

import fs from 'fs/promises';
import path from 'path';
import { chromium } from 'playwright-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
// Register the stealth plugin on the playwright-extra chromium launcher at module load,
// so it is in place before fetchAllProducts() launches a browser.
// NOTE(review): presumably this is to avoid bot detection on the target site — confirm.
chromium.use(StealthPlugin());
/**
 * One entry of a product's `options` list. normalizeProducts() builds one of
 * these per element of the raw product's `POSMetaData.children` array.
 */
type Option = {
// Copied from the child's `option` field; null when that field is missing.
option?: string | null;
// First non-nullish of the child's `recPrice`, `price`, `medPrice`; otherwise null.
price?: number | null;
// First non-nullish of the child's `quantity`, `quantityAvailable`; otherwise null.
quantity?: number | null;
// Copied from the child's `kioskQuantityAvailable`; null when missing.
kioskQuantity?: number | null;
};
/**
 * Flattened product record produced by normalizeProducts() and written to the
 * output JSON file by main(). Field comments name the raw payload field(s)
 * each value is read from.
 */
type Product = {
// Raw `id`, falling back to raw `_id`.
id: string;
// Raw `cName`.
slug?: string;
// Raw `Name`.
name: string;
// Raw `brandName`, falling back to raw `brand.name`.
brand?: string;
// Raw `type`.
type?: string;
// Raw `subcategory`.
category?: string;
// Raw `strainType`.
strainType?: string | null;
// Raw `Status`.
status?: string | null;
// First element of the first available list among raw `recSpecialPrices`,
// `recPrices`, `Prices`; null when none yields a value.
price?: number | null;
// First element of raw `recSpecialPrices`, else of raw `medicalSpecialPrices`, else null.
specialPrice?: number | null;
// Raw `Image`, else the `url` of the first raw `images` entry flagged `active`, else null.
image?: string | null;
// True when any option reports a positive quantity or kiosk quantity,
// or when raw `isBelowThreshold` is falsy.
inStock: boolean;
// One entry per element of raw `POSMetaData.children`; empty when that list is absent.
options: Option[];
// normalizeProducts() always sets this to undefined, so it never appears in the JSON output.
raw?: any;
};
// Path segment appended to https://dutchie.com/embedded-menu/ when opening the menu page.
const DISPENSARY_SLUG = 'AZ-Deeply-Rooted';
// Sent as `productsFilter.dispensaryId` in every FilteredProducts request.
const DISPENSARY_ID = '6405ef617056e8014d79101b';
// Sent as `extensions.persistedQuery.sha256Hash` for the FilteredProducts operation.
// NOTE(review): presumably must match the hash the site currently serves — confirm when requests start returning no data.
const HASH_FILTERED_PRODUCTS = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
// Output location, resolved relative to the process working directory.
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'deeply-rooted');
const OUTPUT_FILE = path.join(OUTPUT_DIR, 'graphql-products.json');
/**
 * Makes sure OUTPUT_DIR exists before anything is written into it.
 * Missing parent directories are created as well (`recursive: true`).
 */
async function ensureOutputDir(): Promise<void> {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(OUTPUT_DIR, mkdirOptions);
}
/**
 * Opens the dispensary's embedded menu in a headless browser and pages through
 * the `FilteredProducts` persisted GraphQL query from inside that page, so the
 * requests carry the page's own cookies and session value.
 *
 * Failure handling is best-effort: an HTTP error stops pagination and the
 * products collected so far are still returned. Every such condition (HTTP
 * failure, GraphQL `errors` in the payload, hitting the page cap) is reported
 * as a warning on the Node console so a short result is not mistaken for a
 * complete catalogue.
 *
 * @returns The collected products converted with normalizeProducts().
 * @throws Whatever Playwright or the in-page fetch/JSON parsing throws; the
 *   browser is closed before the error propagates.
 */
async function fetchAllProducts(): Promise<Product[]> {
  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // try/finally guarantees the browser process is shut down even when
  // navigation or the in-page script fails.
  try {
    const context = await browser.newContext({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.7390.37 Safari/537.36',
    });
    const page = await context.newPage();
    await page.goto(`https://dutchie.com/embedded-menu/${DISPENSARY_SLUG}`, {
      waitUntil: 'domcontentloaded',
      timeout: 90000,
    });
    // Fixed pause after DOMContentLoaded before reading localStorage.
    // NOTE(review): presumably gives the page time to store its session — confirm.
    await page.waitForTimeout(3000);

    // The callback is serialized and executed in the browser, so it must be
    // self-contained: everything it needs is passed through the argument object.
    const { products, warnings } = await page.evaluate(
      async ({ dispensaryId, hash }) => {
        const sessionRaw = localStorage.getItem('dutchie-session');
        // Strip one leading and one trailing double quote from the stored value.
        const session = sessionRaw ? sessionRaw.replace(/^\"|\"$/g, '') : '';
        const all: any[] = [];
        // Collected here because in-page console output never reaches Node.
        const warnings: string[] = [];
        const perPage = 100;
        const maxPages = 40;
        let stoppedEarly = false;

        for (let pageIdx = 0; pageIdx < maxPages; pageIdx++) {
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId,
              pricingType: 'rec',
              Status: 'Active', // set to null to try to include inactive if exposed
              types: [],
              useCache: true,
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageIdx,
            perPage,
          };
          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify({
              persistedQuery: { version: 1, sha256Hash: hash },
            }),
          });
          const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;
          const res = await fetch(url, {
            headers: {
              'apollographql-client-name': 'Marketplace (production)',
              'x-dutchie-session': session,
              'content-type': 'application/json',
            },
            credentials: 'include',
          });
          if (!res.ok) {
            const message = `Request failed ${res.status} on page ${pageIdx}`;
            console.warn(message);
            warnings.push(message);
            stoppedEarly = true;
            break;
          }
          const json = await res.json();

          // A 200 response can still carry GraphQL-level errors (for example an
          // unknown persisted-query hash); surface them instead of silently
          // treating the page as empty.
          const gqlErrors: any[] = Array.isArray(json?.errors) ? json.errors : [];
          if (gqlErrors.length > 0) {
            const details = gqlErrors
              .map((e: any) => e?.message ?? 'unknown error')
              .join('; ');
            warnings.push(`GraphQL errors on page ${pageIdx}: ${details}`);
          }

          const payload = json?.data?.filteredProducts?.products;
          const chunk: any[] = Array.isArray(payload) ? payload : [];
          all.push(...chunk);
          // A short page means the catalogue is exhausted.
          if (chunk.length < perPage) {
            stoppedEarly = true;
            break;
          }
        }

        if (!stoppedEarly) {
          warnings.push(
            `Stopped after ${maxPages} full pages; results may be truncated`
          );
        }
        return { products: all, warnings };
      },
      { dispensaryId: DISPENSARY_ID, hash: HASH_FILTERED_PRODUCTS }
    );

    for (const warning of warnings) {
      console.warn(warning);
    }
    return normalizeProducts(products);
  } finally {
    await browser.close();
  }
}
/**
 * Converts raw `filteredProducts` entries into the flat `Product` shape.
 *
 * Malformed input is tolerated rather than thrown on: entries that are not
 * objects (e.g. `null` list items) are skipped, and `POSMetaData.children`,
 * `images` and the price lists are only read when they are actual arrays.
 *
 * @param items Raw product objects as returned by the GraphQL endpoint.
 * @returns One `Product` per usable raw entry, in input order.
 */
function normalizeProducts(items: any[]): Product[] {
  // First element of a list, or undefined when the value is not an array.
  const first = (list: unknown): number | null | undefined =>
    Array.isArray(list) ? list[0] : undefined;

  const usable = (Array.isArray(items) ? items : []).filter(
    (p) => p != null && typeof p === 'object'
  );

  return usable.map((p) => {
    const rawChildren = p.POSMetaData?.children;
    const children: any[] = Array.isArray(rawChildren)
      ? rawChildren.filter((child: any) => child != null)
      : [];

    const options: Option[] = children.map((child: any) => ({
      option: child.option ?? null,
      price: child.recPrice ?? child.price ?? child.medPrice ?? null,
      quantity: child.quantity ?? child.quantityAvailable ?? null,
      kioskQuantity: child.kioskQuantityAvailable ?? null,
    }));

    // NOTE(review): the special price takes precedence here, so `price` equals
    // `specialPrice` whenever a rec special exists — confirm this is intended.
    const basePrice =
      first(p.recSpecialPrices) ??
      first(p.recPrices) ??
      first(p.Prices) ??
      null;

    // `??` (not `||`) so a legitimate 0 is kept, matching how `basePrice` treats 0.
    const specialPrice =
      first(p.recSpecialPrices) ??
      first(p.medicalSpecialPrices) ??
      null;

    const activeImage = Array.isArray(p.images)
      ? p.images.find((img: any) => img?.active)
      : undefined;
    const image = p.Image || activeImage?.url || null;

    // In stock when any option has positive stock, or when the payload does
    // not flag the product as below threshold.
    // NOTE(review): a missing `isBelowThreshold` therefore counts as in stock.
    const hasStockedOption = options.some(
      (o) => (o.quantity ?? 0) > 0 || (o.kioskQuantity ?? 0) > 0
    );
    const inStock = hasStockedOption || !p.isBelowThreshold;

    return {
      id: p.id || p._id,
      slug: p.cName,
      name: p.Name,
      brand: p.brandName || p.brand?.name,
      type: p.type,
      category: p.subcategory,
      strainType: p.strainType,
      status: p.Status,
      price: basePrice,
      specialPrice,
      image,
      inStock,
      options,
      // Left undefined, so JSON.stringify omits the key from the output file.
      raw: undefined,
    };
  });
}
/**
 * Computes headline statistics for a scraped product list.
 *
 * Brand names are trimmed before counting; a missing, empty or
 * whitespace-only brand is counted under `'Unknown'`.
 *
 * @param products Normalized products to summarize.
 * @returns `total`, `inStock` and `outOfStock` counts plus `topBrands`: up to
 *   ten `[brand, count]` pairs ordered by descending count (ties keep
 *   first-seen order).
 */
function summarize(products: Product[]) {
  const total = products.length;
  const inStock = products.filter((p) => p.inStock).length;
  const outOfStock = total - inStock;

  const byBrand = new Map<string, number>();
  for (const p of products) {
    // Trim first, then fall back, so a blank brand does not become its own '' bucket.
    const key = p.brand?.trim() || 'Unknown';
    byBrand.set(key, (byBrand.get(key) ?? 0) + 1);
  }

  const topBrands = Array.from(byBrand.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10);

  return { total, inStock, outOfStock, topBrands };
}
/**
 * Builds a compact preview of the leading products for console output.
 *
 * @param products Normalized products to sample from.
 * @param n How many products to take from the front of the list (default 5).
 * @returns For each sampled product, only its name, brand, price,
 *   specialPrice, inStock and options fields.
 */
function formatSample(products: Product[], n = 5) {
  const head = products.slice(0, n);
  return head.map(({ name, brand, price, specialPrice, inStock, options }) => {
    const preview = { name, brand, price, specialPrice, inStock, options };
    return preview;
  });
}
/**
 * Runs the scrape end to end: prepares the output directory, fetches and
 * normalizes the products, writes them to OUTPUT_FILE as pretty-printed JSON,
 * then logs the counts, the top brands and a five-product sample.
 */
async function main() {
  await ensureOutputDir();

  const products = await fetchAllProducts();
  const serialized = JSON.stringify(products, null, 2);
  await fs.writeFile(OUTPUT_FILE, serialized);

  const { inStock, outOfStock, topBrands } = summarize(products);
  console.log(`Saved ${products.length} products to ${OUTPUT_FILE}`);
  console.log(`In stock: ${inStock} | Out of stock: ${outOfStock}`);
  console.log('Top brands:', topBrands);

  const sample = formatSample(products, 5);
  console.log('Sample:', JSON.stringify(sample, null, 2));
}
// Script entry point: run the scrape; on any rejection, log the error and
// terminate the process with exit code 1.
main().then(undefined, (failure) => {
  console.error('GraphQL scrape failed:', failure);
  process.exit(1);
});