Files
cannaiq/backend/dist/scripts/scrape-all-active.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

280 lines
12 KiB
JavaScript

"use strict";
/**
* Scrape ALL active products via direct GraphQL pagination
* This is more reliable than category navigation
*/
// Module-interop helper (appears to be compiler-emitted): reuses an existing
// `this.__importDefault` when one is defined; otherwise returns `mod` unchanged
// if it is flagged `__esModule`, or wraps it as `{ default: mod }` so callers
// can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const pg_1 = require("pg");
const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
// Postgres connection string: taken from the DATABASE_URL environment variable,
// falling back to a hard-coded localhost URL when it is unset or empty.
// NOTE(review): the fallback embeds a username and password in source —
// presumably local-development credentials only; confirm they are not reused.
const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
// sha256Hash sent in the `persistedQuery` extension of the FilteredProducts
// GraphQL request issued by scrapeAllProducts.
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
/** Number of products requested per GraphQL page. */
const PRODUCTS_PER_PAGE = 100;

/** Paging stops once the page counter exceeds this value (safety limit). */
const PAGE_SAFETY_LIMIT = 50;

/** Upsert statement for one product row, keyed on (store_id, slug). */
const UPSERT_PRODUCT_SQL = `
INSERT INTO products (
store_id, external_id, slug, name, enterprise_product_id,
brand, brand_external_id, brand_logo_url,
subcategory, strain_type, canonical_category,
price, rec_price, med_price, rec_special_price, med_special_price,
is_on_special, special_name, discount_percent, special_data,
sku, inventory_quantity, inventory_available, is_below_threshold, status,
thc_percentage, cbd_percentage, cannabinoids,
weight_mg, net_weight_value, net_weight_unit, options, raw_options,
image_url, additional_images,
is_featured, medical_only, rec_only,
source_created_at, source_updated_at,
description, raw_data,
dutchie_url, last_seen_at, updated_at
)
VALUES (
$1, $2, $3, $4, $5,
$6, $7, $8,
$9, $10, $11,
$12, $13, $14, $15, $16,
$17, $18, $19, $20,
$21, $22, $23, $24, $25,
$26, $27, $28,
$29, $30, $31, $32, $33,
$34, $35,
$36, $37, $38,
$39, $40,
$41, $42,
'', NOW(), NOW()
)
ON CONFLICT (store_id, slug) DO UPDATE SET
name = EXCLUDED.name,
enterprise_product_id = EXCLUDED.enterprise_product_id,
brand = EXCLUDED.brand,
brand_external_id = EXCLUDED.brand_external_id,
brand_logo_url = EXCLUDED.brand_logo_url,
subcategory = EXCLUDED.subcategory,
strain_type = EXCLUDED.strain_type,
canonical_category = EXCLUDED.canonical_category,
price = EXCLUDED.price,
rec_price = EXCLUDED.rec_price,
med_price = EXCLUDED.med_price,
rec_special_price = EXCLUDED.rec_special_price,
med_special_price = EXCLUDED.med_special_price,
is_on_special = EXCLUDED.is_on_special,
special_name = EXCLUDED.special_name,
discount_percent = EXCLUDED.discount_percent,
special_data = EXCLUDED.special_data,
sku = EXCLUDED.sku,
inventory_quantity = EXCLUDED.inventory_quantity,
inventory_available = EXCLUDED.inventory_available,
is_below_threshold = EXCLUDED.is_below_threshold,
status = EXCLUDED.status,
thc_percentage = EXCLUDED.thc_percentage,
cbd_percentage = EXCLUDED.cbd_percentage,
cannabinoids = EXCLUDED.cannabinoids,
weight_mg = EXCLUDED.weight_mg,
net_weight_value = EXCLUDED.net_weight_value,
net_weight_unit = EXCLUDED.net_weight_unit,
options = EXCLUDED.options,
raw_options = EXCLUDED.raw_options,
image_url = EXCLUDED.image_url,
additional_images = EXCLUDED.additional_images,
is_featured = EXCLUDED.is_featured,
medical_only = EXCLUDED.medical_only,
rec_only = EXCLUDED.rec_only,
source_created_at = EXCLUDED.source_created_at,
source_updated_at = EXCLUDED.source_updated_at,
description = EXCLUDED.description,
raw_data = EXCLUDED.raw_data,
last_seen_at = NOW(),
updated_at = NOW()
RETURNING (xmax = 0) AS was_inserted
`;

/** Per-store summary counts printed after the upsert. */
const STORE_SUMMARY_SQL = `
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE is_on_special) as specials,
COUNT(DISTINCT brand) as brands,
COUNT(DISTINCT subcategory) as categories
FROM products WHERE store_id = $1
`;

/**
 * Build the 42 positional parameters for UPSERT_PRODUCT_SQL.
 *
 * @param {number} storeId - Value bound to `store_id` ($1).
 * @param {object} product - A normalized product (output of normalizeDutchieProduct).
 * @returns {Array} Parameter values in placeholder order ($1..$42).
 */
function buildUpsertParams(storeId, product) {
    // Structured fields are serialized explicitly; falsy values are stored as NULL.
    const toJsonOrNull = (value) => (value ? JSON.stringify(value) : null);
    return [
        storeId,
        product.external_id,
        product.slug,
        product.name,
        product.enterprise_product_id,
        product.brand,
        product.brand_external_id,
        product.brand_logo_url,
        product.subcategory,
        product.strain_type,
        product.canonical_category,
        product.price,
        product.rec_price,
        product.med_price,
        product.rec_special_price,
        product.med_special_price,
        product.is_on_special,
        product.special_name,
        product.discount_percent,
        toJsonOrNull(product.special_data),
        product.sku,
        product.inventory_quantity,
        product.inventory_available,
        product.is_below_threshold,
        product.status,
        product.thc_percentage,
        product.cbd_percentage,
        toJsonOrNull(product.cannabinoids),
        product.weight_mg,
        product.net_weight_value,
        product.net_weight_unit,
        product.options,
        product.raw_options,
        product.image_url,
        product.additional_images,
        product.is_featured,
        product.medical_only,
        product.rec_only,
        product.source_created_at,
        product.source_updated_at,
        product.description,
        toJsonOrNull(product.raw_data),
    ];
}

/**
 * Page through the FilteredProducts persisted query from inside the browser
 * page (so the request carries the page's cookies) until an empty page is
 * returned or the safety limit is hit.
 *
 * @param {object} page - Puppeteer page that has already loaded the menu.
 * @param {string} dispensaryId - Dispensary id used in the products filter.
 * @returns {Promise<Array>} Raw product objects from every fetched page.
 * @throws {Error} When a request returns a non-2xx status, or when the first
 *   page comes back with GraphQL errors and no data.
 */
async function fetchAllProductPages(page, dispensaryId) {
    const allProducts = [];
    let pageNum = 0;
    while (true) {
        // The callback is serialized and run in the browser, so everything it
        // needs is passed in as arguments.
        const result = await page.evaluate(async (dispId, hash, pageIndex, perPage) => {
            const variables = {
                includeEnterpriseSpecials: false,
                productsFilter: {
                    dispensaryId: dispId,
                    pricingType: 'rec',
                    Status: 'Active',
                    types: [],
                    useCache: false,
                    isDefaultSort: true,
                    sortBy: 'popularSortIdx',
                    sortDirection: 1,
                    bypassOnlineThresholds: true,
                    isKioskMenu: false,
                    removeProductsBelowOptionThresholds: false,
                },
                page: pageIndex,
                perPage,
            };
            const qs = new URLSearchParams({
                operationName: 'FilteredProducts',
                variables: JSON.stringify(variables),
                extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
            });
            const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
                method: 'GET',
                headers: {
                    'content-type': 'application/json',
                    'apollographql-client-name': 'Marketplace (production)',
                },
                credentials: 'include',
            });
            if (!resp.ok) {
                throw new Error(`GraphQL request failed with HTTP ${resp.status}`);
            }
            const json = await resp.json();
            const filtered = json?.data?.filteredProducts;
            return {
                products: filtered?.products || [],
                totalCount: filtered?.queryInfo?.totalCount,
                // Only surface errors when no usable data came back.
                errors: filtered ? [] : (json?.errors || []).map((e) => String(e?.message)),
            };
        }, dispensaryId, GRAPHQL_HASH, pageNum, PRODUCTS_PER_PAGE);
        if (result.errors.length > 0) {
            const details = result.errors.join('; ');
            if (pageNum === 0) {
                // Nothing was fetched at all: treat as a failure rather than an empty menu.
                throw new Error(`GraphQL query returned errors: ${details}`);
            }
            // Later pages keep the previous behavior (stop paging) but make it visible.
            console.warn(`GraphQL errors on page ${pageNum}, stopping pagination: ${details}`);
        }
        if (result.products.length === 0) {
            break;
        }
        allProducts.push(...result.products);
        console.log(`Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})`);
        pageNum++;
        // Safety limit
        if (pageNum > PAGE_SAFETY_LIMIT) {
            console.log('Reached page limit');
            break;
        }
    }
    return allProducts;
}

/**
 * Upsert all normalized products for a store inside a single transaction.
 *
 * @param {object} pool - pg Pool to take a client from.
 * @param {number} storeId - Store the products belong to.
 * @param {Array<object>} products - Normalized products.
 * @returns {Promise<{inserted: number, updated: number}>} Counts derived from
 *   the statement's `was_inserted` flag.
 * @throws Re-throws the original query error after attempting a rollback.
 */
async function upsertProducts(pool, storeId, products) {
    const client = await pool.connect();
    let inserted = 0;
    let updated = 0;
    try {
        await client.query('BEGIN');
        for (const product of products) {
            const result = await client.query(UPSERT_PRODUCT_SQL, buildUpsertParams(storeId, product));
            if (result.rows[0]?.was_inserted) {
                inserted++;
            }
            else {
                updated++;
            }
        }
        await client.query('COMMIT');
    }
    catch (error) {
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            // Do not let a failed rollback hide the error that caused it.
            console.error('Rollback failed:', rollbackError.message);
        }
        throw error;
    }
    finally {
        client.release();
    }
    return { inserted, updated };
}

/**
 * Scrape every active product of a Dutchie menu via paginated GraphQL and
 * upsert the normalized results into the `products` table.
 *
 * @param {string} menuUrl - Menu page loaded first to establish a browser session.
 * @param {number} storeId - `store_id` the products are written under.
 * @returns {Promise<{success: boolean, totalProducts: number, inserted: number, updated: number}>}
 * @throws {Error} When the dispensary id cannot be read from the page, when
 *   the GraphQL fetch fails, or when a database operation fails.
 */
async function scrapeAllProducts(menuUrl, storeId) {
    const pool = new pg_1.Pool({ connectionString: DATABASE_URL });
    let browser;
    try {
        browser = await puppeteer_extra_1.default.launch({
            headless: 'new',
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
        console.log('Loading menu to establish session...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        // Extra settle time after network idle before reading page state.
        await new Promise((r) => setTimeout(r, 3000));
        const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId);
        console.log('Dispensary ID:', dispensaryId);
        if (!dispensaryId) {
            // Without an id the products filter is meaningless; fail loudly
            // instead of reporting a "successful" scrape of nothing.
            throw new Error(`Could not determine dispensary ID from ${menuUrl}`);
        }
        // Paginate through all products
        console.log('\nFetching all products via paginated GraphQL...');
        const allProducts = await fetchAllProductPages(page, dispensaryId);
        console.log(`\nTotal products fetched: ${allProducts.length}`);
        // Normalize and upsert
        console.log('\nNormalizing and upserting to database...');
        const normalized = allProducts.map(dutchie_graphql_1.normalizeDutchieProduct);
        const { inserted, updated } = await upsertProducts(pool, storeId, normalized);
        console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`);
        // Show summary stats
        const stats = await pool.query(STORE_SUMMARY_SQL, [storeId]);
        const summary = stats.rows[0];
        console.log('\nStore summary:');
        console.log(` Total products: ${summary.total}`);
        console.log(` On special: ${summary.specials}`);
        console.log(` Unique brands: ${summary.brands}`);
        console.log(` Categories: ${summary.categories}`);
        return {
            success: true,
            totalProducts: allProducts.length,
            inserted,
            updated,
        };
    }
    finally {
        // Close the pool even if the browser failed to launch or to close.
        try {
            if (browser) {
                await browser.close();
            }
        }
        finally {
            await pool.end();
        }
    }
}
// Run: node scrape-all-active.js [menuUrl] [storeId]
// Both arguments are optional and fall back to the defaults below.
const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
const storeId = Number.parseInt(process.argv[3] || '1', 10);
// Reject a non-numeric store id before launching the browser; otherwise NaN
// would only be rejected by the database after the whole scrape has run.
if (Number.isNaN(storeId)) {
    console.error(`Error: invalid store id "${process.argv[3]}" (expected an integer)`);
    process.exit(1);
}
console.log('='.repeat(60));
console.log('DUTCHIE GRAPHQL FULL SCRAPE');
console.log('='.repeat(60));
console.log(`Menu URL: ${menuUrl}`);
console.log(`Store ID: ${storeId}`);
console.log('');
scrapeAllProducts(menuUrl, storeId)
    .then((result) => {
        console.log('\n' + '='.repeat(60));
        console.log('COMPLETE');
        console.log(JSON.stringify(result, null, 2));
    })
    .catch((error) => {
        // Exit non-zero so callers (cron, CI, shell scripts) can detect failure.
        console.error('Error:', error.message);
        process.exit(1);
    });