The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
247 lines
11 KiB
JavaScript
247 lines
11 KiB
JavaScript
"use strict";
|
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.discoverCategories = discoverCategories;
|
|
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
|
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
|
const migrate_1 = require("../db/migrate");
|
|
const logger_1 = require("./logger");
|
|
const age_gate_1 = require("../utils/age-gate");
|
|
const dutchie_1 = require("../scrapers/templates/dutchie");
|
|
// Apply stealth plugin
|
|
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
|
// Predefined category tree written for Dutchie menus.
// Each row is [name, slug, parentSlug?]; rows without a parent slug are
// top-level categories and produce objects with no `parentSlug` key at all.
const DUTCHIE_CATEGORIES = [
    ['Shop', 'shop'],
    ['Flower', 'flower', 'shop'],
    ['Pre-Rolls', 'pre-rolls', 'shop'],
    ['Vaporizers', 'vaporizers', 'shop'],
    ['Concentrates', 'concentrates', 'shop'],
    ['Edibles', 'edibles', 'shop'],
    ['Topicals', 'topicals', 'shop'],
    ['Accessories', 'accessories', 'shop'],
    ['Brands', 'brands'],
    ['Specials', 'specials'],
].map(([name, slug, parentSlug]) => (parentSlug === undefined ? { name, slug } : { name, slug, parentSlug }));
|
|
// Predefined category tree written when the store page is not detected as a Dutchie menu.
// Each row is [name, slug, parentSlug?]; rows without a parent slug are
// top-level categories and produce objects with no `parentSlug` key at all.
const CURALEAF_CATEGORIES = [
    ['Shop', 'shop'],
    ['Flower', 'flower', 'shop'],
    ['Pre-Rolls', 'pre-rolls', 'shop'],
    ['Vaporizers', 'vaporizers', 'shop'],
    ['Concentrates', 'concentrates', 'shop'],
    ['Edibles', 'edibles', 'shop'],
    ['Tinctures', 'tinctures', 'shop'],
    ['Topicals', 'topicals', 'shop'],
    ['Capsules', 'capsules', 'shop'],
    ['Accessories', 'accessories', 'shop'],
].map(([name, slug, parentSlug]) => (parentSlug === undefined ? { name, slug } : { name, slug, parentSlug }));
|
|
/**
 * Registers an init script on `page` (via `evaluateOnNewDocument`) that
 * redefines `navigator.webdriver`, `navigator.plugins` and
 * `navigator.languages` with fixed getters and assigns a stub `window.chrome`.
 *
 * @param {object} page - Object exposing `evaluateOnNewDocument(fn)`.
 * @returns {Promise<void>} Resolves once the script has been registered.
 */
async function makePageStealthy(page) {
    // The callback is handed to the page, so it must not close over anything
    // from this module; everything it needs is declared inside it.
    const installOverrides = () => {
        const spoofedGetters = {
            webdriver: () => false,
            plugins: () => [1, 2, 3, 4, 5],
            languages: () => ['en-US', 'en'],
        };
        for (const [property, get] of Object.entries(spoofedGetters)) {
            Object.defineProperty(navigator, property, { get });
        }
        window.chrome = { runtime: {} };
    };
    await page.evaluateOnNewDocument(installOverrides);
}
|
|
/**
 * Inspects the loaded page for Dutchie markers.
 *
 * Two checks run inside the page: first `window.reactEnv` is examined for an
 * admin/api/consumer URL containing "dutchie.com"; failing that, the document
 * HTML is searched for a handful of known marker strings.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<boolean>} Result of the in-page check; `false` when the
 *   evaluation throws (the error is logged as a warning).
 */
async function isDutchieMenu(page) {
    try {
        // The callback runs in the page, so it is kept fully self-contained.
        return await page.evaluate(() => {
            const env = window.reactEnv;
            if (env) {
                const envUrls = [env.adminUrl, env.apiUrl, env.consumerUrl];
                if (envUrls.some((url) => url?.includes('dutchie.com'))) {
                    return true;
                }
            }
            const markup = document.documentElement.innerHTML;
            const markers = ['admin.dutchie.com', 'api.dutchie.com', 'embedded-menu', 'window.reactEnv'];
            return markers.some((marker) => markup.includes(marker));
        });
    }
    catch (error) {
        logger_1.logger.warn('categories', `Error detecting Dutchie menu: ${error}`);
        return false;
    }
}
|
|
/**
 * Determines which predefined category set applies to a store and writes it.
 *
 * Loads the store row, opens the store's `dutchie_url` in a headless browser
 * (with age-gate cookies set beforehand and a bypass attempt afterwards),
 * checks whether the page looks like a Dutchie menu, closes the browser, and
 * then creates either the Dutchie or the fallback (Curaleaf) category set.
 *
 * @param {*} storeId - Value matched against `stores.id`.
 * @returns {Promise<void>}
 * @throws {Error} 'Store not found' when no row matches; an error when the
 *   store has no `dutchie_url`; otherwise whatever the browser or database
 *   layer throws. Every failure is logged before being rethrown.
 */
async function discoverCategories(storeId) {
    let browser = null;
    try {
        logger_1.logger.info('categories', `Discovering categories for store ID: ${storeId}`);
        const storeResult = await migrate_1.pool.query(`
      SELECT id, name, slug, dutchie_url
      FROM stores
      WHERE id = $1
    `, [storeId]);
        if (storeResult.rows.length === 0) {
            throw new Error('Store not found');
        }
        const store = storeResult.rows[0];
        const baseUrl = store.dutchie_url;
        if (!baseUrl) {
            // Fail before paying for a browser launch; navigation could not succeed anyway.
            throw new Error(`Store ${storeId} has no dutchie_url configured`);
        }
        // Launch browser to check page source
        browser = await puppeteer_extra_1.default.launch({
            headless: 'new',
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled'
            ]
        });
        const page = await browser.newPage();
        await makePageStealthy(page);
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
        const state = (0, age_gate_1.detectStateFromUrl)(baseUrl);
        await (0, age_gate_1.setAgeGateCookies)(page, baseUrl, state);
        logger_1.logger.info('categories', `Loading page to detect menu type: ${baseUrl}`);
        await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
        // Fixed settle delay. A plain timer is used instead of page.waitForTimeout,
        // which is deprecated and absent from newer Puppeteer releases.
        await new Promise((resolve) => setTimeout(resolve, 3000));
        // If age gate still appears, try to bypass it
        await (0, age_gate_1.bypassAgeGate)(page, state);
        // Detect if it's a Dutchie menu by inspecting page source
        const isDutchie = await isDutchieMenu(page);
        // The browser is no longer needed; release it before the database work.
        // Clearing the reference first keeps the finally block from closing twice.
        const launchedBrowser = browser;
        browser = null;
        await launchedBrowser.close();
        if (isDutchie) {
            logger_1.logger.info('categories', `✅ Detected Dutchie menu for ${store.name}`);
            await createDutchieCategories(storeId, store);
        }
        else {
            // Fallback: Use standard cannabis categories for non-Dutchie sites
            logger_1.logger.info('categories', `Non-Dutchie menu detected, using standard cannabis categories for ${store.name}`);
            await createCuraleafCategories(storeId, store);
        }
    }
    catch (error) {
        logger_1.logger.error('categories', `Category discovery error: ${error}`);
        throw error;
    }
    finally {
        if (browser) {
            // Best-effort cleanup: a failure here must not replace the error
            // that brought us to this point.
            try {
                await browser.close();
            }
            catch (closeError) {
                logger_1.logger.warn('categories', `Failed to close browser: ${closeError}`);
            }
        }
    }
}
|
|
/**
 * Upserts the predefined DUTCHIE_CATEGORIES tree for a store inside a single
 * transaction.
 *
 * Top-level categories get the URL `${store.dutchie_url}/${slug}`;
 * subcategories get the URL produced by the Dutchie template's
 * `buildCategoryUrl(baseUrl, name)`. A subcategory is only written when a
 * category row with its parent slug exists for the store; otherwise it is
 * skipped with a warning.
 *
 * @param {*} storeId - Store id used as `categories.store_id`.
 * @param {object} store - Store row; only `dutchie_url` is read.
 * @returns {Promise<void>}
 * @throws Rethrows any query error after attempting a ROLLBACK. The pooled
 *   client is always released.
 */
async function createDutchieCategories(storeId, store) {
    const client = await migrate_1.pool.connect();
    try {
        await client.query('BEGIN');
        logger_1.logger.info('categories', `Creating predefined Dutchie category structure`);
        const baseUrl = store.dutchie_url;
        // Slugs known to exist for this store (written or looked up during this
        // transaction), so each parent is queried at most once.
        const knownSlugs = new Set();
        let savedCount = 0;
        for (const category of DUTCHIE_CATEGORIES) {
            const isTopLevel = !category.parentSlug;
            // Use Dutchie template to build subcategory URLs (e.g., /products/flower);
            // top-level categories use the base URL with the slug appended.
            const categoryUrl = isTopLevel
                ? `${baseUrl}/${category.slug}`
                : dutchie_1.dutchieTemplate.buildCategoryUrl(baseUrl, category.name);
            if (!isTopLevel && !knownSlugs.has(category.parentSlug)) {
                const parentResult = await client.query(`
          SELECT id FROM categories
          WHERE store_id = $1 AND slug = $2
        `, [storeId, category.parentSlug]);
                if (parentResult.rows.length === 0) {
                    logger_1.logger.warn('categories', `Skipping ${category.name}: parent category "${category.parentSlug}" not found`);
                    continue;
                }
                knownSlugs.add(category.parentSlug);
            }
            await client.query(`
        INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
        VALUES ($1, $2, $3, $4, true)
        ON CONFLICT (store_id, slug)
        DO UPDATE SET name = $2, dutchie_url = $4
      `, [storeId, category.name, category.slug, categoryUrl]);
            knownSlugs.add(category.slug);
            savedCount += 1;
            logger_1.logger.info('categories', isTopLevel ? `📁 ${category.name}` : `  └── ${category.name}`);
        }
        await client.query('COMMIT');
        // Report what was actually written, not the size of the constant.
        logger_1.logger.info('categories', `✅ Created ${savedCount} Dutchie categories successfully`);
    }
    catch (error) {
        // A failing ROLLBACK must not replace the error that caused it.
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            logger_1.logger.error('categories', `Rollback failed: ${rollbackError}`);
        }
        logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`);
        throw error;
    }
    finally {
        client.release();
    }
}
|
|
/**
 * Upserts the predefined CURALEAF_CATEGORIES tree for a store inside a single
 * transaction. Used as the fallback when the store page is not a Dutchie menu.
 *
 * Top-level categories point at the store's base URL; subcategories point at
 * the base URL with a `category=<slug>` query parameter appended. A
 * subcategory is only written when a category row with its parent slug exists
 * for the store; otherwise it is skipped with a warning.
 *
 * @param {*} storeId - Store id used as `categories.store_id`.
 * @param {object} store - Store row; only `dutchie_url` is read.
 * @returns {Promise<void>}
 * @throws Rethrows any query error after attempting a ROLLBACK. The pooled
 *   client is always released.
 */
async function createCuraleafCategories(storeId, store) {
    const client = await migrate_1.pool.connect();
    try {
        await client.query('BEGIN');
        logger_1.logger.info('categories', `Creating predefined Curaleaf category structure`);
        const baseUrl = store.dutchie_url;
        // Append with '&' when the base URL already carries a query string so the
        // result stays a well-formed URL.
        const querySeparator = String(baseUrl).includes('?') ? '&' : '?';
        // Slugs known to exist for this store (written or looked up during this
        // transaction), so each parent is queried at most once.
        const knownSlugs = new Set();
        let savedCount = 0;
        for (const category of CURALEAF_CATEGORIES) {
            const isTopLevel = !category.parentSlug;
            // Subcategories are addressed through a `category` query parameter;
            // top-level categories use the base URL as-is.
            const categoryUrl = isTopLevel
                ? baseUrl
                : `${baseUrl}${querySeparator}category=${category.slug}`;
            if (!isTopLevel && !knownSlugs.has(category.parentSlug)) {
                const parentResult = await client.query(`
          SELECT id FROM categories
          WHERE store_id = $1 AND slug = $2
        `, [storeId, category.parentSlug]);
                if (parentResult.rows.length === 0) {
                    logger_1.logger.warn('categories', `Skipping ${category.name}: parent category "${category.parentSlug}" not found`);
                    continue;
                }
                knownSlugs.add(category.parentSlug);
            }
            await client.query(`
        INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
        VALUES ($1, $2, $3, $4, true)
        ON CONFLICT (store_id, slug)
        DO UPDATE SET name = $2, dutchie_url = $4
      `, [storeId, category.name, category.slug, categoryUrl]);
            knownSlugs.add(category.slug);
            savedCount += 1;
            logger_1.logger.info('categories', isTopLevel ? `📁 ${category.name}` : `  └── ${category.name}`);
        }
        await client.query('COMMIT');
        // Report what was actually written, not the size of the constant.
        logger_1.logger.info('categories', `✅ Created ${savedCount} Curaleaf categories successfully`);
    }
    catch (error) {
        // A failing ROLLBACK must not replace the error that caused it.
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            logger_1.logger.error('categories', `Rollback failed: ${rollbackError}`);
        }
        logger_1.logger.error('categories', `Failed to create Curaleaf categories: ${error}`);
        throw error;
    }
    finally {
        client.release();
    }
}
|