The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
280 lines
12 KiB
JavaScript
280 lines
12 KiB
JavaScript
"use strict";
|
|
/**
|
|
* Scrape ALL active products via direct GraphQL pagination
|
|
* This is more reliable than category navigation
|
|
*/
|
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
|
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
|
const pg_1 = require("pg");
|
|
const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
|
|
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
|
// Postgres connection string handed to the pg Pool in scrapeAllProducts.
// Taken from the DATABASE_URL environment variable when it is set; otherwise
// falls back to a database on localhost:54320.
// NOTE(review): the fallback embeds a username and password — presumably a
// local development default; confirm it is never valid in a deployed environment.
const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
|
|
// sha256 hash sent as `extensions.persistedQuery.sha256Hash` with the
// 'FilteredProducts' GraphQL operation issued from inside the browser page.
// NOTE(review): presumably must match the query currently deployed by the
// remote site — confirm how this value is refreshed when it goes stale.
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
|
|
/**
 * Scrape every active product of one menu through paginated GraphQL requests
 * and upsert the normalized products into the `products` table.
 *
 * Flow: launch a stealth browser, load the menu so the page holds a session,
 * read the dispensary id from `window.reactEnv`, page through the
 * `FilteredProducts` operation, normalize each product, upsert everything in
 * a single transaction, then log per-store summary counts.
 *
 * @param {string} menuUrl - Menu page to load before issuing GraphQL requests.
 * @param {number} storeId - Value written to `products.store_id` and used to
 *   scope the summary query.
 * @returns {Promise<{success: boolean, totalProducts: number, inserted: number, updated: number}>}
 *   `totalProducts` is the number of products fetched (before upsert).
 * @throws {Error} When the dispensary id cannot be read from the page, when a
 *   GraphQL request fails, or when the database transaction fails (the
 *   transaction is rolled back first).
 */
async function scrapeAllProducts(menuUrl, storeId) {
    const pool = new pg_1.Pool({ connectionString: DATABASE_URL });
    let browser;
    try {
        // Launched inside the try so the pool is still ended if launch fails.
        browser = await puppeteer_extra_1.default.launch({
            headless: 'new',
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
        console.log('Loading menu to establish session...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        // Fixed extra pause after network idle before reading page state.
        await new Promise((resolve) => setTimeout(resolve, 3000));
        const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId);
        console.log('Dispensary ID:', dispensaryId);
        if (!dispensaryId) {
            // JSON.stringify drops undefined keys, so continuing would send a
            // products filter with no dispensary scope and store whatever
            // comes back under this storeId.
            throw new Error(`Could not read dispensaryId from ${menuUrl}`);
        }
        const allProducts = await fetchAllProducts(page, dispensaryId);
        console.log(`\nTotal products fetched: ${allProducts.length}`);
        console.log('\nNormalizing and upserting to database...');
        const normalized = allProducts.map(dutchie_graphql_1.normalizeDutchieProduct);
        const { inserted, updated } = await upsertProducts(pool, storeId, normalized);
        console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`);
        await logStoreSummary(pool, storeId);
        return {
            success: true,
            totalProducts: allProducts.length,
            inserted,
            updated,
        };
    }
    finally {
        // Nested finally: the pool must be ended even when closing the
        // browser throws.
        try {
            if (browser) {
                await browser.close();
            }
        }
        finally {
            await pool.end();
        }
    }
}

/**
 * Page through the `FilteredProducts` GraphQL operation from inside the
 * browser page (so the request carries the page's cookies) and collect every
 * returned product.
 *
 * Pagination stops at the first empty page, or after page index 50 as a
 * safety limit.
 *
 * @param {object} page - Puppeteer page that has already loaded the menu.
 * @param {string} dispensaryId - Dispensary id placed in the products filter.
 * @returns {Promise<object[]>} Raw product objects in the order received.
 * @throws {Error} On a non-2xx HTTP response, or when the very first page
 *   returns GraphQL errors and no product data.
 */
async function fetchAllProducts(page, dispensaryId) {
    const perPage = 100;
    const lastPageIndex = 50;
    const allProducts = [];
    let pageNum = 0;
    console.log('\nFetching all products via paginated GraphQL...');
    while (true) {
        // The callback is serialized and executed in the browser; it can only
        // use its arguments, not variables from this Node scope.
        const batch = await page.evaluate(async (dispId, hash, pageIndex, pageSize) => {
            const variables = {
                includeEnterpriseSpecials: false,
                productsFilter: {
                    dispensaryId: dispId,
                    pricingType: 'rec',
                    Status: 'Active',
                    types: [],
                    useCache: false,
                    isDefaultSort: true,
                    sortBy: 'popularSortIdx',
                    sortDirection: 1,
                    bypassOnlineThresholds: true,
                    isKioskMenu: false,
                    removeProductsBelowOptionThresholds: false,
                },
                page: pageIndex,
                perPage: pageSize,
            };
            const qs = new URLSearchParams({
                operationName: 'FilteredProducts',
                variables: JSON.stringify(variables),
                extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
            });
            const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
                method: 'GET',
                headers: {
                    'content-type': 'application/json',
                    'apollographql-client-name': 'Marketplace (production)',
                },
                credentials: 'include',
            });
            if (!resp.ok) {
                // Error bodies may not be JSON; report the status instead of parsing.
                return { ok: false, status: resp.status, errorMessages: [], products: [] };
            }
            const json = await resp.json();
            const filtered = json?.data?.filteredProducts;
            const errorMessages = !filtered && Array.isArray(json?.errors)
                ? json.errors.map((e) => String(e?.message))
                : [];
            return {
                ok: true,
                status: resp.status,
                errorMessages,
                products: filtered?.products || [],
                totalCount: filtered?.queryInfo?.totalCount,
            };
        }, dispensaryId, GRAPHQL_HASH, pageNum, perPage);
        if (!batch.ok) {
            throw new Error(`GraphQL request for page ${pageNum} failed with HTTP ${batch.status}`);
        }
        if (batch.errorMessages.length > 0) {
            const detail = batch.errorMessages.join('; ');
            if (allProducts.length === 0) {
                // Nothing fetched at all: this is a failed scrape, not an
                // empty menu, so do not report success.
                throw new Error(`GraphQL returned errors and no products: ${detail}`);
            }
            // Past the first page, keep what was collected and stop paginating,
            // but make the reason visible.
            console.warn(`Page ${pageNum}: GraphQL errors, stopping pagination: ${detail}`);
            break;
        }
        if (batch.products.length === 0) {
            break;
        }
        allProducts.push(...batch.products);
        console.log(`Page ${pageNum}: ${batch.products.length} products (total so far: ${allProducts.length}/${batch.totalCount})`);
        pageNum++;
        // Safety limit
        if (pageNum > lastPageIndex) {
            console.log('Reached page limit');
            break;
        }
    }
    return allProducts;
}

/**
 * Upsert normalized products for one store inside a single transaction.
 *
 * Rows conflict on (store_id, slug); on conflict every data column is
 * overwritten and `last_seen_at` / `updated_at` are set to NOW().
 *
 * @param {object} pool - pg Pool to take a client from.
 * @param {number} storeId - Value for the `store_id` column.
 * @param {object[]} products - Normalized products; the fields read are the
 *   ones listed in the parameter array below.
 * @returns {Promise<{inserted: number, updated: number}>} Row counts, split by
 *   the `was_inserted` flag returned from each statement.
 * @throws {Error} Any query error, after attempting ROLLBACK.
 */
async function upsertProducts(pool, storeId, products) {
    // `xmax = 0` is used to tell a fresh insert from a conflict update.
    const upsertSql = `
        INSERT INTO products (
          store_id, external_id, slug, name, enterprise_product_id,
          brand, brand_external_id, brand_logo_url,
          subcategory, strain_type, canonical_category,
          price, rec_price, med_price, rec_special_price, med_special_price,
          is_on_special, special_name, discount_percent, special_data,
          sku, inventory_quantity, inventory_available, is_below_threshold, status,
          thc_percentage, cbd_percentage, cannabinoids,
          weight_mg, net_weight_value, net_weight_unit, options, raw_options,
          image_url, additional_images,
          is_featured, medical_only, rec_only,
          source_created_at, source_updated_at,
          description, raw_data,
          dutchie_url, last_seen_at, updated_at
        )
        VALUES (
          $1, $2, $3, $4, $5,
          $6, $7, $8,
          $9, $10, $11,
          $12, $13, $14, $15, $16,
          $17, $18, $19, $20,
          $21, $22, $23, $24, $25,
          $26, $27, $28,
          $29, $30, $31, $32, $33,
          $34, $35,
          $36, $37, $38,
          $39, $40,
          $41, $42,
          '', NOW(), NOW()
        )
        ON CONFLICT (store_id, slug) DO UPDATE SET
          name = EXCLUDED.name,
          enterprise_product_id = EXCLUDED.enterprise_product_id,
          brand = EXCLUDED.brand,
          brand_external_id = EXCLUDED.brand_external_id,
          brand_logo_url = EXCLUDED.brand_logo_url,
          subcategory = EXCLUDED.subcategory,
          strain_type = EXCLUDED.strain_type,
          canonical_category = EXCLUDED.canonical_category,
          price = EXCLUDED.price,
          rec_price = EXCLUDED.rec_price,
          med_price = EXCLUDED.med_price,
          rec_special_price = EXCLUDED.rec_special_price,
          med_special_price = EXCLUDED.med_special_price,
          is_on_special = EXCLUDED.is_on_special,
          special_name = EXCLUDED.special_name,
          discount_percent = EXCLUDED.discount_percent,
          special_data = EXCLUDED.special_data,
          sku = EXCLUDED.sku,
          inventory_quantity = EXCLUDED.inventory_quantity,
          inventory_available = EXCLUDED.inventory_available,
          is_below_threshold = EXCLUDED.is_below_threshold,
          status = EXCLUDED.status,
          thc_percentage = EXCLUDED.thc_percentage,
          cbd_percentage = EXCLUDED.cbd_percentage,
          cannabinoids = EXCLUDED.cannabinoids,
          weight_mg = EXCLUDED.weight_mg,
          net_weight_value = EXCLUDED.net_weight_value,
          net_weight_unit = EXCLUDED.net_weight_unit,
          options = EXCLUDED.options,
          raw_options = EXCLUDED.raw_options,
          image_url = EXCLUDED.image_url,
          additional_images = EXCLUDED.additional_images,
          is_featured = EXCLUDED.is_featured,
          medical_only = EXCLUDED.medical_only,
          rec_only = EXCLUDED.rec_only,
          source_created_at = EXCLUDED.source_created_at,
          source_updated_at = EXCLUDED.source_updated_at,
          description = EXCLUDED.description,
          raw_data = EXCLUDED.raw_data,
          last_seen_at = NOW(),
          updated_at = NOW()
        RETURNING (xmax = 0) AS was_inserted
      `;
    const client = await pool.connect();
    let inserted = 0;
    let updated = 0;
    try {
        await client.query('BEGIN');
        for (const product of products) {
            const outcome = await client.query(upsertSql, [
                storeId,
                product.external_id,
                product.slug,
                product.name,
                product.enterprise_product_id,
                product.brand,
                product.brand_external_id,
                product.brand_logo_url,
                product.subcategory,
                product.strain_type,
                product.canonical_category,
                product.price,
                product.rec_price,
                product.med_price,
                product.rec_special_price,
                product.med_special_price,
                product.is_on_special,
                product.special_name,
                product.discount_percent,
                product.special_data ? JSON.stringify(product.special_data) : null,
                product.sku,
                product.inventory_quantity,
                product.inventory_available,
                product.is_below_threshold,
                product.status,
                product.thc_percentage,
                product.cbd_percentage,
                product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
                product.weight_mg,
                product.net_weight_value,
                product.net_weight_unit,
                product.options,
                product.raw_options,
                product.image_url,
                product.additional_images,
                product.is_featured,
                product.medical_only,
                product.rec_only,
                product.source_created_at,
                product.source_updated_at,
                product.description,
                product.raw_data ? JSON.stringify(product.raw_data) : null,
            ]);
            if (outcome.rows[0]?.was_inserted) {
                inserted++;
            }
            else {
                updated++;
            }
        }
        await client.query('COMMIT');
    }
    catch (error) {
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            // Report it, but keep the original failure as the thrown error.
            console.error('Rollback failed:', rollbackError.message);
        }
        throw error;
    }
    finally {
        client.release();
    }
    return { inserted, updated };
}

/**
 * Query and print aggregate counts (total rows, rows on special, distinct
 * brands, distinct subcategories) for one store's rows in `products`.
 *
 * @param {object} pool - pg Pool used to run the query.
 * @param {number} storeId - Store whose rows are counted.
 * @returns {Promise<void>}
 */
async function logStoreSummary(pool, storeId) {
    const stats = await pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE is_on_special) as specials,
        COUNT(DISTINCT brand) as brands,
        COUNT(DISTINCT subcategory) as categories
      FROM products WHERE store_id = $1
    `, [storeId]);
    const summary = stats.rows[0];
    console.log('\nStore summary:');
    console.log(`  Total products: ${summary.total}`);
    console.log(`  On special: ${summary.specials}`);
    console.log(`  Unique brands: ${summary.brands}`);
    console.log(`  Categories: ${summary.categories}`);
}
|
|
// Run
// Usage: node <script> [menuUrl] [storeId]
// Both arguments are optional; the defaults below are used when omitted.
const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
const storeId = parseInt(process.argv[3] || '1', 10);
if (Number.isNaN(storeId)) {
    // Reject a non-numeric store id up front instead of launching the browser,
    // scraping everything, and only then failing at the database.
    console.error(`Error: store ID must be an integer, got "${process.argv[3]}"`);
    process.exit(1);
}
console.log('='.repeat(60));
console.log('DUTCHIE GRAPHQL FULL SCRAPE');
console.log('='.repeat(60));
console.log(`Menu URL: ${menuUrl}`);
console.log(`Store ID: ${storeId}`);
console.log('');
scrapeAllProducts(menuUrl, storeId)
    .then((result) => {
        console.log(`\n${'='.repeat(60)}`);
        console.log('COMPLETE');
        console.log(JSON.stringify(result, null, 2));
    })
    .catch((error) => {
        console.error('Error:', error.message);
        // Non-zero exit so callers and schedulers can detect the failure.
        process.exit(1);
    });
|