Remove incorrect migration 029, add snapshot architecture, improve scraper

- Delete migration 029, which incorrectly created duplicate dispensaries
- Add migration 028 for snapshot architecture
- Add browser fingerprint rotation to the downloader and proxy/UA rotation to the middlewares
- Update scraper monitor and tools pages
- Various scraper improvements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Kelly
2025-12-01 08:52:54 -07:00
parent e5b88b093c
commit 199b6a8a23
12 changed files with 760 additions and 341 deletions

View File

@@ -0,0 +1,240 @@
-- Migration 028: Snapshot Architecture
-- Implements append-only snapshots for full history tracking
-- Following the principle: "Never delete, only append observations"
-- =====================================================
-- LAYER 1: Raw Append-Only Snapshots (NEVER DELETE)
-- =====================================================
-- Product snapshots - one row per product per crawl
CREATE TABLE IF NOT EXISTS product_snapshots (
id SERIAL PRIMARY KEY,
-- Source identification
crawl_id UUID NOT NULL, -- Groups all products from same crawl run
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
-- Product identification
external_product_id VARCHAR(255), -- Dutchie/provider product ID
product_slug VARCHAR(500), -- URL slug for matching
-- Product details (as seen at crawl time)
name VARCHAR(500) NOT NULL,
brand VARCHAR(255),
category VARCHAR(100),
subcategory VARCHAR(100),
-- Pricing snapshot
price NUMERIC(10,2),
original_price NUMERIC(10,2),
sale_price NUMERIC(10,2),
discount_type VARCHAR(50),
discount_value VARCHAR(100),
-- Availability snapshot
availability_status VARCHAR(30) NOT NULL DEFAULT 'unknown',
-- 'in_stock', 'out_of_stock', 'limited', 'removed_from_menu', 'unknown'
stock_quantity INTEGER,
-- Potency snapshot
thc_percentage NUMERIC(5,2),
cbd_percentage NUMERIC(5,2),
-- Product attributes
strain_type VARCHAR(100),
weight VARCHAR(100),
variant VARCHAR(255),
-- Rich data
description TEXT,
image_url TEXT,
effects TEXT[],
terpenes TEXT[],
-- Timestamp
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Raw metadata from provider
raw_data JSONB
);
-- Indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_time ON product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl ON product_snapshots(crawl_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON product_snapshots(brand, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_product_slug ON product_snapshots(product_slug, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON product_snapshots(external_product_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_availability ON product_snapshots(availability_status, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_category ON product_snapshots(category, captured_at DESC);
-- Brand snapshots - summary of brands seen per crawl
CREATE TABLE IF NOT EXISTS brand_snapshots (
id SERIAL PRIMARY KEY,
crawl_id UUID NOT NULL,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
brand_name VARCHAR(255) NOT NULL,
product_count INTEGER NOT NULL DEFAULT 0,
in_stock_count INTEGER NOT NULL DEFAULT 0,
-- Price range for this brand at this store at this time
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
avg_price NUMERIC(10,2),
-- Categories this brand has products in
categories TEXT[],
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_dispensary ON brand_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_crawl ON brand_snapshots(crawl_id);
-- Crawl runs table - metadata about each crawl
CREATE TABLE IF NOT EXISTS crawl_runs (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
completed_at TIMESTAMPTZ,
status VARCHAR(20) NOT NULL DEFAULT 'running', -- 'running', 'completed', 'failed'
-- Results
products_found INTEGER DEFAULT 0,
brands_found INTEGER DEFAULT 0,
categories_found INTEGER DEFAULT 0,
-- Errors if any
error_message TEXT,
-- Provider info
provider VARCHAR(50),
menu_url TEXT
);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
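-- Illustrative lifecycle (comment-only sketch, not executed by this migration;
-- dispensary id, URL, and counts below are placeholders):
--   1. Open a run:
--        INSERT INTO crawl_runs (dispensary_id, provider, menu_url)
--        VALUES (42, 'dutchie', 'https://dutchie.com/dispensary/example') RETURNING id;
--   2. Append one snapshot row per product seen (never UPDATE, never DELETE):
--        INSERT INTO product_snapshots (crawl_id, dispensary_id, name, price, availability_status)
--        VALUES ('<run id>', 42, 'Example Product', 25.00, 'in_stock');
--   3. Close the run:
--        UPDATE crawl_runs SET status = 'completed', completed_at = NOW(),
--        products_found = 120 WHERE id = '<run id>';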
-- =====================================================
-- LAYER 2: Summary/Rollup Tables (can be recalculated)
-- =====================================================
-- Daily brand summary per store
CREATE TABLE IF NOT EXISTS brand_store_day_summary (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
brand_name VARCHAR(255) NOT NULL,
summary_date DATE NOT NULL,
-- Presence
first_seen_at TIMESTAMPTZ,
last_seen_at TIMESTAMPTZ,
crawl_count INTEGER DEFAULT 0, -- how many times we saw this brand today
-- Product counts
total_skus INTEGER DEFAULT 0,
in_stock_skus INTEGER DEFAULT 0,
out_of_stock_events INTEGER DEFAULT 0,
-- Price stats
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
avg_price NUMERIC(10,2),
-- Categories
categories TEXT[],
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(dispensary_id, brand_name, summary_date)
);
CREATE INDEX IF NOT EXISTS idx_brand_store_day_dispensary ON brand_store_day_summary(dispensary_id, summary_date DESC);
CREATE INDEX IF NOT EXISTS idx_brand_store_day_brand ON brand_store_day_summary(brand_name, summary_date DESC);
-- Product SKU daily summary
CREATE TABLE IF NOT EXISTS product_sku_day_summary (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
product_slug VARCHAR(500) NOT NULL,
summary_date DATE NOT NULL,
-- Latest values
name VARCHAR(500),
brand VARCHAR(255),
category VARCHAR(100),
-- Price tracking
opening_price NUMERIC(10,2), -- first price of day
closing_price NUMERIC(10,2), -- last price of day
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
price_changes INTEGER DEFAULT 0,
-- Availability
times_in_stock INTEGER DEFAULT 0,
times_out_of_stock INTEGER DEFAULT 0,
first_seen_at TIMESTAMPTZ,
last_seen_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(dispensary_id, product_slug, summary_date)
);
CREATE INDEX IF NOT EXISTS idx_sku_day_dispensary ON product_sku_day_summary(dispensary_id, summary_date DESC);
CREATE INDEX IF NOT EXISTS idx_sku_day_slug ON product_sku_day_summary(product_slug, summary_date DESC);
-- =====================================================
-- VIEWS for common queries
-- =====================================================
-- Current products view (latest snapshot per product)
CREATE OR REPLACE VIEW current_products AS
SELECT DISTINCT ON (ps.dispensary_id, ps.product_slug)
ps.*,
d.name AS dispensary_name,
COALESCE(d.dba_name, d.name) AS store_name
FROM product_snapshots ps
JOIN dispensaries d ON d.id = ps.dispensary_id
ORDER BY ps.dispensary_id, ps.product_slug, ps.captured_at DESC;
-- Current brands per store view
CREATE OR REPLACE VIEW current_brands AS
SELECT DISTINCT ON (bs.dispensary_id, bs.brand_name)
bs.*,
d.name AS dispensary_name,
COALESCE(d.dba_name, d.name) AS store_name
FROM brand_snapshots bs
JOIN dispensaries d ON d.id = bs.dispensary_id
WHERE bs.captured_at >= NOW() - INTERVAL '7 days'
ORDER BY bs.dispensary_id, bs.brand_name, bs.captured_at DESC;
-- Brand coverage across stores
CREATE OR REPLACE VIEW brand_store_coverage AS
SELECT
brand_name,
COUNT(DISTINCT dispensary_id) AS store_count,
SUM(product_count) AS total_skus,
MIN(min_price) AS market_min_price,
MAX(max_price) AS market_max_price,
AVG(avg_price) AS market_avg_price,
MAX(captured_at) AS last_seen_at
FROM brand_snapshots
WHERE captured_at >= NOW() - INTERVAL '7 days'
GROUP BY brand_name;
-- Grant permissions
GRANT SELECT, INSERT ON product_snapshots TO scraper;
GRANT SELECT, INSERT ON brand_snapshots TO scraper;
GRANT SELECT, INSERT, UPDATE ON crawl_runs TO scraper;
GRANT SELECT, INSERT, UPDATE ON brand_store_day_summary TO scraper;
GRANT SELECT, INSERT, UPDATE ON product_sku_day_summary TO scraper;
GRANT SELECT ON current_products TO scraper;
GRANT SELECT ON current_brands TO scraper;
GRANT SELECT ON brand_store_coverage TO scraper;
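
How the scraper side is expected to read this schema, as a minimal TypeScript sketch — the `pool` import path follows the one used elsewhere in this codebase, and the query shapes are assumptions built directly on the tables and views above:

import { pool } from '../db/migrate';

// Full price/availability history is just a time-ordered scan of the
// append-only snapshot table
async function getPriceHistory(dispensaryId: number, productSlug: string) {
  const result = await pool.query(
    `SELECT price, sale_price, availability_status, captured_at
     FROM product_snapshots
     WHERE dispensary_id = $1 AND product_slug = $2
     ORDER BY captured_at DESC
     LIMIT 100`,
    [dispensaryId, productSlug]
  );
  return result.rows;
}

// "What's on the menu right now" resolves through the current_products
// view (latest snapshot per product) rather than any mutable table
async function getCurrentMenu(dispensaryId: number) {
  const result = await pool.query(
    'SELECT * FROM current_products WHERE dispensary_id = $1',
    [dispensaryId]
  );
  return result.rows;
}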

View File

@@ -1,56 +0,0 @@
-- =====================================================
-- Link Dutchie Stores to Dispensaries
-- =====================================================
-- Creates dispensary records for stores with dutchie_url that
-- don't yet have a dispensary_id, then links them.
-- Create dispensaries for unlinked stores with dutchie_url
DO $$
DECLARE
store_rec RECORD;
new_slug TEXT;
new_disp_id INTEGER;
BEGIN
FOR store_rec IN
SELECT id, name, dutchie_url
FROM stores
WHERE dutchie_url IS NOT NULL AND dispensary_id IS NULL
LOOP
-- Extract slug from dutchie_url
new_slug := regexp_replace(
regexp_replace(store_rec.dutchie_url, '^https://dutchie\.com/(embedded-menu|dispensary)/', ''),
'/.*$', ''
);
-- Insert or update dispensary
INSERT INTO dispensaries (name, slug, address, city, state, provider_type, menu_url, created_at, updated_at)
VALUES (
store_rec.name,
new_slug,
'TBD', -- Address to be filled in later
'TBD', -- City to be filled in later
'AZ', -- Default state
'dutchie',
store_rec.dutchie_url,
NOW(),
NOW()
)
ON CONFLICT (slug) DO UPDATE SET
provider_type = 'dutchie',
menu_url = EXCLUDED.menu_url,
updated_at = NOW()
RETURNING id INTO new_disp_id;
-- Link store to dispensary
UPDATE stores SET dispensary_id = new_disp_id WHERE id = store_rec.id;
RAISE NOTICE 'Linked store % (%) to dispensary %', store_rec.id, store_rec.name, new_disp_id;
END LOOP;
END $$;
-- Report on linked stores
SELECT s.id as store_id, s.name as store_name, s.dispensary_id, d.slug as disp_slug
FROM stores s
JOIN dispensaries d ON d.id = s.dispensary_id
WHERE s.dutchie_url IS NOT NULL
ORDER BY s.id;

View File

@@ -71,36 +71,46 @@ router.get('/active/:id', async (req, res) => {
// Get scraper history (last 50 completed scrapes)
router.get('/history', async (req, res) => {
try {
const { limit = 50, store_id } = req.query;
const { limit = 50, dispensary_id } = req.query;
let query = `
SELECT
s.id as store_id,
s.name as store_name,
c.id as category_id,
c.name as category_name,
c.last_scraped_at,
d.id as dispensary_id,
COALESCE(d.dba_name, d.name) as dispensary_name,
d.city,
d.state,
dcj.id as job_id,
dcj.job_type,
dcj.status,
dcj.products_found,
dcj.products_new,
dcj.products_updated,
dcj.in_stock_count,
dcj.out_of_stock_count,
dcj.duration_ms,
dcj.completed_at as last_scraped_at,
dcj.error_message,
(
SELECT COUNT(*)
FROM products p
WHERE p.store_id = s.id
AND p.category_id = c.id
WHERE p.dispensary_id = d.id
AND p.last_seen_at >= NOW() - INTERVAL '7 days'
) as product_count
FROM stores s
LEFT JOIN categories c ON c.store_id = s.id
WHERE c.last_scraped_at IS NOT NULL
FROM dispensary_crawl_jobs dcj
JOIN dispensaries d ON d.id = dcj.dispensary_id
WHERE dcj.completed_at IS NOT NULL
`;
const params: any[] = [];
let paramCount = 1;
if (store_id) {
query += ` AND s.id = $${paramCount}`;
params.push(store_id);
if (dispensary_id) {
query += ` AND d.id = $${paramCount}`;
params.push(dispensary_id);
paramCount++;
}
query += ` ORDER BY c.last_scraped_at DESC LIMIT $${paramCount}`;
query += ` ORDER BY dcj.completed_at DESC LIMIT $${paramCount}`;
params.push(limit);
const result = await pool.query(query, params);
@@ -169,7 +179,7 @@ export function completeScraper(id: string, error?: string): void {
}
}
// Brand scrape jobs endpoints
// Dispensary crawl jobs endpoints
router.get('/jobs/stats', async (req, res) => {
try {
const { dispensary_id } = req.query;
@@ -187,8 +197,8 @@ router.get('/jobs/stats', async (req, res) => {
status,
COUNT(*) as count,
SUM(products_found) as total_products_found,
SUM(products_saved) as total_products_saved
FROM brand_scrape_jobs
SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved
FROM dispensary_crawl_jobs
${whereClause}
GROUP BY status
`, params);
@@ -205,8 +215,8 @@ router.get('/jobs/stats', async (req, res) => {
result.rows.forEach((row: { status: string; count: string; total_products_found?: string; total_products_saved?: string }) => {
stats[row.status as keyof typeof stats] = parseInt(row.count);
if (row.status === 'completed') {
stats.total_products_found = parseInt(row.total_products_found || '0');
stats.total_products_saved = parseInt(row.total_products_saved || '0');
stats.total_products_found += parseInt(row.total_products_found || '0');
stats.total_products_saved += parseInt(row.total_products_saved || '0');
}
});
@@ -221,31 +231,32 @@ router.get('/jobs/active', async (req, res) => {
try {
const { dispensary_id } = req.query;
let whereClause = "WHERE status = 'in_progress'";
let whereClause = "WHERE dcj.status = 'in_progress'";
const params: any[] = [];
let paramCount = 1;
if (dispensary_id) {
whereClause += ` AND dispensary_id = $${paramCount}`;
whereClause += ` AND dcj.dispensary_id = $${paramCount}`;
params.push(dispensary_id);
paramCount++;
}
const result = await pool.query(`
SELECT
id,
dispensary_id,
brand_slug,
brand_name,
status,
worker_id,
started_at,
products_found,
products_saved,
EXTRACT(EPOCH FROM (NOW() - started_at)) as duration_seconds
FROM brand_scrape_jobs
dcj.id,
dcj.dispensary_id,
COALESCE(d.dba_name, d.name) as dispensary_name,
dcj.job_type,
dcj.status,
dcj.worker_id,
dcj.started_at,
dcj.products_found,
COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds
FROM dispensary_crawl_jobs dcj
JOIN dispensaries d ON d.id = dcj.dispensary_id
${whereClause}
ORDER BY started_at DESC
ORDER BY dcj.started_at DESC
`, params);
res.json({ jobs: result.rows });
@@ -266,13 +277,13 @@ router.get('/jobs/recent', async (req, res) => {
const conditions: string[] = [];
if (dispensary_id) {
conditions.push(`dispensary_id = $${paramCount}`);
conditions.push(`dcj.dispensary_id = $${paramCount}`);
params.push(dispensary_id);
paramCount++;
}
if (status) {
conditions.push(`status = $${paramCount}`);
conditions.push(`dcj.status = $${paramCount}`);
params.push(status);
paramCount++;
}
@@ -285,22 +296,22 @@ router.get('/jobs/recent', async (req, res) => {
const result = await pool.query(`
SELECT
id,
dispensary_id,
brand_slug,
brand_name,
status,
worker_id,
started_at,
completed_at,
products_found,
products_saved,
error_message,
retry_count,
EXTRACT(EPOCH FROM (COALESCE(completed_at, NOW()) - started_at)) as duration_seconds
FROM brand_scrape_jobs
dcj.id,
dcj.dispensary_id,
COALESCE(d.dba_name, d.name) as dispensary_name,
dcj.job_type,
dcj.status,
dcj.worker_id,
dcj.started_at,
dcj.completed_at,
dcj.products_found,
COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
dcj.error_message,
EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds
FROM dispensary_crawl_jobs dcj
JOIN dispensaries d ON d.id = dcj.dispensary_id
${whereClause}
ORDER BY created_at DESC
ORDER BY dcj.created_at DESC
LIMIT $${paramCount}
`, params);
@@ -328,10 +339,10 @@ router.get('/jobs/workers', async (req, res) => {
worker_id,
COUNT(*) as active_jobs,
SUM(products_found) as total_products_found,
SUM(products_saved) as total_products_saved,
SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved,
MIN(started_at) as earliest_start,
MAX(started_at) as latest_start
FROM brand_scrape_jobs
FROM dispensary_crawl_jobs
${whereClause}
GROUP BY worker_id
ORDER BY worker_id
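
For reference, a client-side sketch of polling the reworked endpoints. The `/api/scrapers` mount point is an assumption (the router prefix is not shown in this diff); the response shapes follow the queries above:

async function pollCrawlJobs(dispensaryId?: number) {
  const qs = dispensaryId ? `?dispensary_id=${dispensaryId}` : '';
  const [active, stats] = await Promise.all([
    fetch(`/api/scrapers/jobs/active${qs}`).then(r => r.json()),
    fetch(`/api/scrapers/jobs/stats${qs}`).then(r => r.json()),
  ]);
  // Job rows are now keyed by dispensary_name and job_type from
  // dispensary_crawl_jobs, rather than brand_name from brand_scrape_jobs
  return { activeJobs: active.jobs, stats };
}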

View File

@@ -3,16 +3,108 @@ import axios from 'axios';
import { ScraperRequest, ScraperResponse, ScraperError, ErrorType, ProxyConfig } from './types';
import { logger } from '../services/logger';
// Fingerprint profiles for randomization
const SCREEN_RESOLUTIONS = [
{ width: 1920, height: 1080 },
{ width: 1366, height: 768 },
{ width: 1536, height: 864 },
{ width: 1440, height: 900 },
{ width: 1280, height: 720 },
{ width: 2560, height: 1440 },
{ width: 1680, height: 1050 },
{ width: 1600, height: 900 },
];
const TIMEZONES = [
'America/New_York',
'America/Chicago',
'America/Denver',
'America/Los_Angeles',
'America/Phoenix',
];
const LANGUAGES = [
['en-US', 'en'],
['en-US', 'en', 'es'],
['en-US'],
];
const PLATFORMS = [
'Win32',
'MacIntel',
'Linux x86_64',
];
const WEBGL_VENDORS = [
'Google Inc. (NVIDIA)',
'Google Inc. (Intel)',
'Google Inc. (AMD)',
'Intel Inc.',
'NVIDIA Corporation',
];
const WEBGL_RENDERERS = [
'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)',
'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)',
'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)',
'Intel Iris OpenGL Engine',
'NVIDIA GeForce RTX 3070/PCIe/SSE2',
'AMD Radeon Pro 5500M OpenGL Engine',
];
interface Fingerprint {
screen: { width: number; height: number };
timezone: string;
languages: string[];
platform: string;
hardwareConcurrency: number;
deviceMemory: number;
webglVendor: string;
webglRenderer: string;
}
function generateRandomFingerprint(): Fingerprint {
return {
screen: SCREEN_RESOLUTIONS[Math.floor(Math.random() * SCREEN_RESOLUTIONS.length)],
timezone: TIMEZONES[Math.floor(Math.random() * TIMEZONES.length)],
languages: LANGUAGES[Math.floor(Math.random() * LANGUAGES.length)],
platform: PLATFORMS[Math.floor(Math.random() * PLATFORMS.length)],
hardwareConcurrency: [4, 8, 12, 16][Math.floor(Math.random() * 4)],
deviceMemory: [4, 8, 16, 32][Math.floor(Math.random() * 4)],
webglVendor: WEBGL_VENDORS[Math.floor(Math.random() * WEBGL_VENDORS.length)],
webglRenderer: WEBGL_RENDERERS[Math.floor(Math.random() * WEBGL_RENDERERS.length)],
};
}
export class Downloader {
private browser: Browser | null = null;
private page: Page | null = null;
private pageInUse: boolean = false;
private currentFingerprint: Fingerprint = generateRandomFingerprint();
private needsNewFingerprint: boolean = false;
/**
* Initialize browser instance (lazy initialization)
* Force new fingerprint on next browser creation
*/
private async getBrowser(): Promise<Browser> {
rotateFingerprint(): void {
this.needsNewFingerprint = true;
logger.info('scraper', '🔄 Fingerprint rotation scheduled');
}
/**
* Initialize browser instance with fingerprint
*/
private async getBrowser(forceNew: boolean = false): Promise<Browser> {
// Create new browser if needed for fingerprint rotation
if (forceNew || this.needsNewFingerprint) {
await this.close();
this.currentFingerprint = generateRandomFingerprint();
this.needsNewFingerprint = false;
logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`);
}
if (!this.browser || !this.browser.isConnected()) {
const { screen } = this.currentFingerprint;
const launchOptions: any = {
headless: 'new',
args: [
@@ -20,9 +112,11 @@ export class Downloader {
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'--window-size=1920,1080',
`--window-size=${screen.width},${screen.height}`,
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process'
'--disable-features=IsolateOrigins,site-per-process',
'--disable-infobars',
'--disable-extensions',
]
};
@@ -34,52 +128,157 @@ export class Downloader {
}
/**
* Get or create a page instance
* Get or create a page instance with current fingerprint
*/
private async getPage(): Promise<Page> {
if (!this.page || this.page.isClosed()) {
const browser = await this.getBrowser();
private async getPage(forceNew: boolean = false): Promise<Page> {
if (!this.page || this.page.isClosed() || forceNew) {
const browser = await this.getBrowser(forceNew);
this.page = await browser.newPage();
await this.page.setViewport({ width: 1920, height: 1080 });
logger.debug('scraper', 'New page created');
const { screen } = this.currentFingerprint;
await this.page.setViewport({
width: screen.width,
height: screen.height,
deviceScaleFactor: 1,
});
// Apply fingerprint
await this.applyFingerprint(this.page);
logger.debug('scraper', 'New page created with fingerprint');
}
return this.page;
}
/**
* Apply stealth mode to page
* Apply full fingerprint to page
*/
private async makePageStealthy(page: Page): Promise<void> {
await page.evaluateOnNewDocument(() => {
// @ts-ignore - runs in browser context
private async applyFingerprint(page: Page): Promise<void> {
const fp = this.currentFingerprint;
await page.evaluateOnNewDocument((fingerprint) => {
// Hide webdriver
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
// @ts-ignore - runs in browser context
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
// Spoof platform
Object.defineProperty(navigator, 'platform', {
get: () => fingerprint.platform,
});
// @ts-ignore - runs in browser context
// Spoof languages
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
get: () => fingerprint.languages,
});
// @ts-ignore - runs in browser context
// Spoof hardware concurrency
Object.defineProperty(navigator, 'hardwareConcurrency', {
get: () => fingerprint.hardwareConcurrency,
});
// Spoof device memory
Object.defineProperty(navigator, 'deviceMemory', {
get: () => fingerprint.deviceMemory,
});
// Spoof plugins (realistic count)
Object.defineProperty(navigator, 'plugins', {
get: () => {
const plugins: any = [];
for (let i = 0; i < 5; i++) {
plugins.push({
name: `Plugin ${i}`,
filename: `plugin${i}.dll`,
description: `Description ${i}`,
});
}
plugins.length = 5;
return plugins;
},
});
// Chrome object
(window as any).chrome = {
runtime: {},
loadTimes: () => ({}),
csi: () => ({}),
app: {},
};
// @ts-ignore - runs in browser context
// Permissions
// Bind, or calling the saved original unwrapped throws "Illegal invocation"
const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
// @ts-ignore - runs in browser context
window.navigator.permissions.query = (parameters: any) =>
parameters.name === 'notifications'
? Promise.resolve({ state: 'denied' } as any)
: originalQuery(parameters);
});
// WebGL fingerprint spoofing
const getParameterProxyHandler = {
apply: function(target: any, thisArg: any, argumentsList: any) {
const param = argumentsList[0];
// UNMASKED_VENDOR_WEBGL
if (param === 37445) {
return fingerprint.webglVendor;
}
// UNMASKED_RENDERER_WEBGL
if (param === 37446) {
return fingerprint.webglRenderer;
}
return Reflect.apply(target, thisArg, argumentsList);
}
};
// Override WebGL
const originalGetContext = HTMLCanvasElement.prototype.getContext;
(HTMLCanvasElement.prototype as any).getContext = function(this: HTMLCanvasElement, type: string, ...args: any[]) {
const context = originalGetContext.call(this, type, ...args);
if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) {
const glContext = context as WebGLRenderingContext;
const originalGetParameter = glContext.getParameter.bind(glContext);
(glContext as any).getParameter = new Proxy(originalGetParameter, getParameterProxyHandler);
}
return context;
};
// Canvas fingerprint noise
const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
HTMLCanvasElement.prototype.toDataURL = function(type?: string, quality?: any) {
const context = this.getContext('2d');
// Guard: getImageData throws on zero-sized canvases
if (context && this.width > 0 && this.height > 0) {
const imageData = context.getImageData(0, 0, this.width, this.height);
for (let i = 0; i < imageData.data.length; i += 4) {
// Flip the low bit of the red channel at random - imperceptible,
// but enough to perturb canvas-hash fingerprints
imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0);
}
context.putImageData(imageData, 0, 0);
}
return originalToDataURL.call(this, type, quality);
};
// Screen dimensions
Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width });
Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height });
Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width });
Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 });
Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width });
Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 });
Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width });
Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height });
}, fp);
// Set timezone via CDP
const client = await page.target().createCDPSession();
await client.send('Emulation.setTimezoneOverride', { timezoneId: fp.timezone });
}
/**
* Apply stealth mode to page (legacy - now uses applyFingerprint)
*/
private async makePageStealthy(page: Page): Promise<void> {
// Now handled by applyFingerprint
await this.applyFingerprint(page);
}
/**
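
How rotation is meant to be driven end to end, as a sketch: BotDetectionMiddleware (below) sets request.metadata.needsNewBrowser, and the crawl loop reacts by scheduling a fresh fingerprint. The download(request) entry point is assumed — this hunk only shows the browser/page plumbing:

const downloader = new Downloader();

async function fetchWithRotation(request: ScraperRequest) {
  if (request.metadata.needsNewBrowser) {
    // The next getBrowser() call closes the old browser and regenerates
    // screen/timezone/platform/WebGL values before creating a new page
    downloader.rotateFingerprint();
    request.metadata.needsNewBrowser = false;
  }
  return downloader.download(request); // assumed public entry point
}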

View File

@@ -1,13 +1,32 @@
import { Middleware, ScraperRequest, ScraperResponse, ScraperError, ErrorType, ProxyConfig } from './types';
import { logger } from '../services/logger';
import { pool } from '../db/migrate';
import { getActiveProxy, putProxyInTimeout, isBotDetectionError } from '../services/proxy';
// Diverse, realistic user agents - updated for 2024/2025
const USER_AGENTS = [
// Chrome on Windows (most common)
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
// Chrome on Mac
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
// Chrome on Linux
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
// Firefox
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
// Safari
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
// Edge
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
];
function getRandomUserAgent(): string {
@@ -19,59 +38,90 @@ function sleep(ms: number): Promise<void> {
}
/**
* User Agent Rotation Middleware
* User Agent Rotation Middleware - rotates UA on each request for better evasion
*/
export class UserAgentMiddleware implements Middleware {
name = 'UserAgentMiddleware';
priority = 100;
private lastUserAgent: string | null = null;
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
if (!request.metadata.userAgent) {
request.metadata.userAgent = getRandomUserAgent();
// Always rotate UA on retries or bot detection
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
if (!request.metadata.userAgent || forceRotation) {
// Get a different UA than the last one used
let newUA = getRandomUserAgent();
let attempts = 0;
while (newUA === this.lastUserAgent && attempts < 5) {
newUA = getRandomUserAgent();
attempts++;
}
request.metadata.userAgent = newUA;
this.lastUserAgent = newUA;
if (forceRotation) {
logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
}
}
return request;
}
}
/**
* Proxy Rotation Middleware
* Proxy Rotation Middleware - uses the central proxy service with timeout handling
*/
export class ProxyMiddleware implements Middleware {
name = 'ProxyMiddleware';
priority = 90;
private async getActiveProxy(): Promise<ProxyConfig | null> {
try {
const result = await pool.query(`
SELECT host, port, protocol, username, password
FROM proxies
WHERE active = true AND is_anonymous = true
ORDER BY RANDOM()
LIMIT 1
`);
if (result.rows.length === 0) {
return null;
}
return result.rows[0];
} catch (error) {
logger.error('scraper', `Failed to get proxy: ${error}`);
return null;
}
}
private currentProxyId: number | null = null;
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
// Only add proxy if not already set
if (!request.metadata.proxy && request.retryCount > 0) {
// Use proxy on retries
request.metadata.proxy = await this.getActiveProxy();
if (request.metadata.proxy) {
logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`);
// Always try to use a proxy from the central proxy service
// The service handles bot detection timeouts automatically
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
if (!request.metadata.proxy || forceRotation) {
// Get proxy from central service - it handles timeouts automatically
const proxy = await getActiveProxy();
if (proxy) {
request.metadata.proxy = {
host: proxy.host,
port: proxy.port,
protocol: proxy.protocol,
username: proxy.username,
password: proxy.password,
};
request.metadata.proxyId = proxy.id;
this.currentProxyId = proxy.id;
const reason = forceRotation ? 'rotation' : 'initial';
logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
} else {
logger.warn('scraper', '⚠️ No proxy available - running without proxy');
}
}
return request;
}
async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
// If bot detection was triggered, put the proxy in timeout
if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
putProxyInTimeout(response.request.metadata.proxyId, 'Bot detection triggered');
logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
}
return response;
}
async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
// If bot detection error, put proxy in timeout
if (isBotDetectionError(error.message) && request.metadata.proxyId) {
putProxyInTimeout(request.metadata.proxyId, error.message);
logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
}
return error;
}
}
/**
@@ -194,7 +244,7 @@ export class RetryMiddleware implements Middleware {
}
/**
* Bot Detection Middleware
* Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation
*/
export class BotDetectionMiddleware implements Middleware {
name = 'BotDetectionMiddleware';
@@ -203,6 +253,9 @@ export class BotDetectionMiddleware implements Middleware {
private detectedCount: number = 0;
private readonly DETECTION_THRESHOLD = 3;
// Export for use by other middlewares
static shouldRotateFingerprint: boolean = false;
async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
const content = typeof response.content === 'string'
? response.content
@@ -215,17 +268,29 @@ export class BotDetectionMiddleware implements Middleware {
/access denied/i,
/you have been blocked/i,
/unusual traffic/i,
/robot/i
/robot/i,
/verify.*human/i,
/security check/i,
/please wait/i,
/checking your browser/i,
/ray id/i
];
const detected = botIndicators.some(pattern => pattern.test(content));
if (detected) {
this.detectedCount++;
BotDetectionMiddleware.shouldRotateFingerprint = true;
logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');
// Mark the request for rotation on retry
response.request.metadata.botDetected = true;
response.request.metadata.needsNewBrowser = true;
if (this.detectedCount >= this.DETECTION_THRESHOLD) {
const error: ScraperError = new Error('Bot detection threshold reached') as ScraperError;
const error: ScraperError = new Error('Bot detection threshold reached - rotating fingerprint') as ScraperError;
error.type = ErrorType.BOT_DETECTION;
error.retryable = true;
error.request = response.request;
@@ -234,10 +299,25 @@ export class BotDetectionMiddleware implements Middleware {
} else {
// Gradually decrease detection count on successful requests
this.detectedCount = Math.max(0, this.detectedCount - 0.5);
BotDetectionMiddleware.shouldRotateFingerprint = false;
}
return response;
}
async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
// If bot detection error, flag for rotation and allow retry
if ('type' in error && (error as ScraperError).type === ErrorType.BOT_DETECTION) {
request.metadata.botDetected = true;
request.metadata.needsNewBrowser = true;
logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
// Add delay before retry to avoid rate limiting
await sleep(5000 + Math.random() * 5000);
return null; // Return null to trigger retry
}
return error;
}
}
/**
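
Putting the pieces together — a sketch of how an engine might order these middlewares, assuming the convention that higher priority runs earlier on outgoing requests (so the User-Agent at priority 100 is settled before the proxy at 90 is attached):

const middlewares: Middleware[] = [
  new UserAgentMiddleware(),
  new ProxyMiddleware(),
  new BotDetectionMiddleware(),
].sort((a, b) => b.priority - a.priority);

async function applyRequestMiddlewares(request: ScraperRequest): Promise<ScraperRequest> {
  for (const mw of middlewares) {
    if (mw.processRequest) {
      request = await mw.processRequest(request);
    }
  }
  return request;
}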

View File

@@ -154,6 +154,17 @@ export class ImagePipeline implements ItemPipeline<Product> {
}
}
/**
* Generate a URL-safe slug from a product name
*/
function generateSlug(name: string): string {
return name
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-+|-+$/g, '')
.substring(0, 400);
}
/**
* Database Pipeline - saves items to database
*/
@@ -168,6 +179,10 @@ export class DatabasePipeline implements ItemPipeline<Product> {
// Extract store and category from metadata (set by spider)
const storeId = (item as any).storeId;
const categoryId = (item as any).categoryId;
const dispensaryId = (item as any).dispensaryId;
// Generate slug from name
const slug = generateSlug(item.name);
if (!storeId || !categoryId) {
logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`);
@@ -195,13 +210,13 @@ export class DatabasePipeline implements ItemPipeline<Product> {
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14)
WHERE id = $12
`, [
item.name, item.description, item.price,
item.strainType, item.thcPercentage, item.cbdPercentage,
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
JSON.stringify(item.metadata || {}), productId
JSON.stringify(item.metadata || {}), productId, dispensaryId, slug
]);
logger.debug('pipeline', `Updated product: ${item.name}`);
@@ -209,13 +224,13 @@ export class DatabasePipeline implements ItemPipeline<Product> {
// Insert new product
const insertResult = await client.query(`
INSERT INTO products (
store_id, category_id, dutchie_product_id, name, description,
store_id, category_id, dispensary_id, dutchie_product_id, slug, name, description,
price, strain_type, thc_percentage, cbd_percentage,
brand, weight, image_url, dutchie_url, in_stock, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16)
RETURNING id
`, [
storeId, categoryId, item.dutchieProductId, item.name, item.description,
storeId, categoryId, dispensaryId, item.dutchieProductId, slug, item.name, item.description,
item.price, item.strainType, item.thcPercentage, item.cbdPercentage,
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
JSON.stringify(item.metadata || {})
@@ -228,12 +243,19 @@ export class DatabasePipeline implements ItemPipeline<Product> {
// Download image if needed
if (item.imageUrl && !localImagePath) {
try {
localImagePath = await uploadImageFromUrl(item.imageUrl, productId);
// Get store slug for organized image storage
const storeResult = await client.query(
'SELECT slug FROM stores WHERE id = $1',
[storeId]
);
const storeSlug = storeResult.rows[0]?.slug || undefined;
const imageSizes = await uploadImageFromUrl(item.imageUrl, productId, storeSlug);
// Use thumbnail path for local_image_path
localImagePath = imageSizes.thumbnail;
await client.query(`
UPDATE products
SET local_image_path = $1
WHERE id = $2
`, [localImagePath, productId]);
UPDATE products SET local_image_path = $1 WHERE id = $2
`, [imageSizes.thumbnail, productId]);
logger.debug('pipeline', `Downloaded image for: ${item.name}`);
} catch (error) {
logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`);

View File

@@ -176,7 +176,7 @@ async function queueProductionCrawls(): Promise<number> {
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
FROM stores s
JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%')
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
WHERE d.id = $1
LIMIT 1`,
[dispensary.id]

View File

@@ -221,7 +221,7 @@ async function queueCategoryProductionCrawls(category?: IntelligenceCategory): P
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
FROM stores s
JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%')
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
WHERE d.id = $1
LIMIT 1`,
[dispensary.id, cat]

View File

@@ -131,7 +131,7 @@ async function getStoreIdForDispensary(dispensaryId: number): Promise<number | n
// Check if there's a stores entry linked to this dispensary
const result = await pool.query(
`SELECT s.id FROM stores s
JOIN dispensaries d ON d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%'
JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
WHERE d.id = $1
LIMIT 1`,
[dispensaryId]

View File

@@ -130,6 +130,11 @@ async function uploadToLocalFilesystem(
const mediumPath = `${baseFilename}-medium.png`;
const fullPath = `${baseFilename}-full.png`;
// Ensure the target directory exists (in case initializeMinio wasn't called)
// Extract directory from baseFilename (e.g., 'products/store-slug' or just 'products')
const targetDir = path.join(LOCAL_IMAGES_PATH, path.dirname(baseFilename));
await fs.mkdir(targetDir, { recursive: true });
await Promise.all([
fs.writeFile(path.join(LOCAL_IMAGES_PATH, thumbnailPath), thumbnailBuffer),
fs.writeFile(path.join(LOCAL_IMAGES_PATH, mediumPath), mediumBuffer),
@@ -173,7 +178,12 @@ async function uploadToMinio(
};
}
export async function uploadImageFromUrl(imageUrl: string, productId: number, removeBackgrounds = true): Promise<ImageSizes> {
export async function uploadImageFromUrl(
imageUrl: string,
productId: number,
storeSlug?: string,
removeBackgrounds = true
): Promise<ImageSizes> {
try {
// Download image
const response = await axios.get(imageUrl, { responseType: 'arraybuffer' });
@@ -184,8 +194,9 @@ export async function uploadImageFromUrl(imageUrl: string, productId: number, re
buffer = await removeBackground(buffer);
}
// Generate unique base filename
const baseFilename = `products/${productId}-${uuidv4()}`;
// Generate unique base filename - organize by store if slug provided
const storeDir = storeSlug ? `products/${storeSlug}` : 'products';
const baseFilename = `${storeDir}/${productId}-${uuidv4()}`;
// Create multiple sizes with Sharp and convert to WebP/PNG for better compression
// Use PNG for images with transparency
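
An example call under the new layout (all values are placeholders): with a store slug the three renditions land under <LOCAL_IMAGES_PATH>/products/<storeSlug>/, and without one they fall back to the flat products/ directory as before:

const sizes = await uploadImageFromUrl(
  'https://example.com/product.jpg', // source image URL (placeholder)
  4217,                              // product id (placeholder)
  'example-store'                    // store slug (placeholder); omit for legacy layout
);
// sizes.thumbnail is what DatabasePipeline writes to products.local_image_path
console.log(sizes.thumbnail);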

View File

@@ -93,7 +93,7 @@ export function ScraperMonitor() {
marginBottom: '-2px'
}}
>
Brand Scrape Jobs
Dispensary Jobs
</button>
<button
onClick={() => setActiveTab('scrapers')}
@@ -109,7 +109,7 @@ export function ScraperMonitor() {
marginBottom: '-2px'
}}
>
Legacy Scrapers
Crawl History
</button>
</div>
@@ -232,10 +232,10 @@ export function ScraperMonitor() {
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'start' }}>
<div style={{ flex: 1 }}>
<div style={{ fontSize: '18px', fontWeight: '600', marginBottom: '8px' }}>
{job.brand_name}
{job.dispensary_name || job.brand_name}
</div>
<div style={{ fontSize: '14px', color: '#666', marginBottom: '12px' }}>
Worker: {job.worker_id} | Job #{job.id}
{job.job_type || 'crawl'} | Job #{job.id}
</div>
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(150px, 1fr))', gap: '12px' }}>
<div>
@@ -290,8 +290,8 @@ export function ScraperMonitor() {
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
<thead>
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Brand</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Worker</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Type</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Status</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Found</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Saved</th>
@@ -302,8 +302,8 @@ export function ScraperMonitor() {
<tbody>
{recentJobs.map((job: any) => (
<tr key={job.id} style={{ borderBottom: '1px solid #eee' }}>
<td style={{ padding: '15px' }}>{job.brand_name}</td>
<td style={{ padding: '15px', fontSize: '14px', color: '#666' }}>{job.worker_id || '-'}</td>
<td style={{ padding: '15px' }}>{job.dispensary_name || job.brand_name}</td>
<td style={{ padding: '15px', fontSize: '14px', color: '#666' }}>{job.job_type || '-'}</td>
<td style={{ padding: '15px', textAlign: 'center' }}>
<span style={{
padding: '4px 10px',
@@ -481,22 +481,37 @@ export function ScraperMonitor() {
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
<thead>
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Store</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Category</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Status</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Found</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Products</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Scraped</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Crawled</th>
</tr>
</thead>
<tbody>
{history.map((item, index) => (
<tr key={index} style={{ borderBottom: '1px solid #eee' }}>
<td style={{ padding: '15px' }}>{item.store_name}</td>
<td style={{ padding: '15px' }}>{item.category_name}</td>
<td style={{ padding: '15px' }}>{item.dispensary_name || item.store_name}</td>
<td style={{ padding: '15px' }}>
<span style={{
padding: '4px 10px',
borderRadius: '12px',
fontSize: '12px',
fontWeight: '600',
background: item.status === 'completed' ? '#d1fae5' : item.status === 'failed' ? '#fee2e2' : '#fef3c7',
color: item.status === 'completed' ? '#065f46' : item.status === 'failed' ? '#991b1b' : '#92400e'
}}>
{item.status || '-'}
</span>
</td>
<td style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>
{item.products_found || '-'}
</td>
<td style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>
{item.product_count}
</td>
<td style={{ padding: '15px', color: '#666' }}>
{new Date(item.last_scraped_at).toLocaleString()}
{item.last_scraped_at ? new Date(item.last_scraped_at).toLocaleString() : '-'}
</td>
</tr>
))}

View File

@@ -17,61 +17,61 @@ const USER_AGENTS = {
};
export function ScraperTools() {
const [stores, setStores] = useState<any[]>([]);
const [selectedStore, setSelectedStore] = useState<number | null>(null);
const [dispensaries, setDispensaries] = useState<any[]>([]);
const [selectedDispensary, setSelectedDispensary] = useState<number | null>(null);
const [parallelScrapers, setParallelScrapers] = useState(3);
const [selectedUserAgent, setSelectedUserAgent] = useState<string>('rotate-desktop');
const [scraping, setScraping] = useState(false);
const [downloadingImages, setDownloadingImages] = useState(false);
const [discoveringCategories, setDiscoveringCategories] = useState(false);
const [debugging, setDebugging] = useState(false);
const [notification, setNotification] = useState<{ message: string; type: 'success' | 'error' | 'info' } | null>(null);
const [loading, setLoading] = useState(true);
useEffect(() => {
loadStores();
loadDispensaries();
}, []);
const loadStores = async () => {
const loadDispensaries = async () => {
setLoading(true);
try {
const data = await api.getStores();
setStores(data.stores);
if (data.stores.length > 0) {
setSelectedStore(data.stores[0].id);
const data = await api.getDispensaries();
// Filter to dispensaries that have a menu_url and are scrape enabled
const scrapableDispensaries = data.dispensaries.filter((d: any) => d.menu_url && d.scrape_enabled);
setDispensaries(scrapableDispensaries);
if (scrapableDispensaries.length > 0) {
setSelectedDispensary(scrapableDispensaries[0].id);
}
} catch (error) {
console.error('Failed to load stores:', error);
console.error('Failed to load dispensaries:', error);
} finally {
setLoading(false);
}
};
const handleScrape = async () => {
if (!selectedStore || scraping) return;
if (!selectedDispensary || scraping) return;
setScraping(true);
try {
await api.scrapeStore(selectedStore, parallelScrapers, selectedUserAgent || undefined);
await api.triggerDispensaryCrawl(selectedDispensary);
setNotification({
message: `Scrape started with ${parallelScrapers} parallel scrapers using ${USER_AGENTS[selectedUserAgent as keyof typeof USER_AGENTS] || 'Random'} UA! Check the Scraper Monitor for progress.`,
message: `Crawl started for dispensary! Check the Scraper Monitor for progress.`,
type: 'success'
});
} catch (error: any) {
setNotification({ message: 'Failed to start scrape: ' + error.message, type: 'error' });
setNotification({ message: 'Failed to start crawl: ' + error.message, type: 'error' });
} finally {
setScraping(false);
}
};
const handleDownloadImages = async () => {
if (!selectedStore || downloadingImages) return;
if (!selectedDispensary || downloadingImages) return;
setDownloadingImages(true);
try {
const result = await api.downloadStoreImages(selectedStore);
// TODO: Implement dispensary image download endpoint
setNotification({
message: `Image download started! ${result.total_missing} missing images will be downloaded.`,
message: `Image download feature coming soon!`,
type: 'info'
});
} catch (error: any) {
@@ -81,35 +81,7 @@ export function ScraperTools() {
}
};
const handleDiscoverCategories = async () => {
if (!selectedStore || discoveringCategories) return;
setDiscoveringCategories(true);
try {
await api.discoverStoreCategories(selectedStore);
setNotification({ message: 'Category discovery started! Check logs for progress.', type: 'info' });
} catch (error: any) {
setNotification({ message: 'Failed to start category discovery: ' + error.message, type: 'error' });
} finally {
setDiscoveringCategories(false);
}
};
const handleDebug = async () => {
if (!selectedStore || debugging) return;
setDebugging(true);
try {
await api.debugScrapeStore(selectedStore);
setNotification({ message: 'Debug started! Check Logs page for output.', type: 'info' });
} catch (error: any) {
setNotification({ message: 'Debug failed: ' + error.message, type: 'error' });
} finally {
setDebugging(false);
}
};
const selectedStoreData = stores.find(s => s.id === selectedStore);
const selectedDispensaryData = dispensaries.find(d => d.id === selectedDispensary);
if (loading) {
return (
@@ -133,32 +105,32 @@ export function ScraperTools() {
<div className="space-y-6">
<div>
<h1 className="text-3xl font-bold">Scraper Tools</h1>
<p className="text-gray-500 mt-2">Manage scraping operations for your stores</p>
<p className="text-gray-500 mt-2">Manage crawling operations for dispensaries</p>
</div>
{/* Store Selection */}
{/* Dispensary Selection */}
<div className="card bg-base-100 shadow-xl">
<div className="card-body">
<h2 className="card-title">Select Store</h2>
<h2 className="card-title">Select Dispensary</h2>
<select
className="select select-bordered w-full max-w-md"
value={selectedStore || ''}
onChange={(e) => setSelectedStore(parseInt(e.target.value))}
value={selectedDispensary || ''}
onChange={(e) => setSelectedDispensary(parseInt(e.target.value))}
>
{stores.map(store => (
<option key={store.id} value={store.id}>
{store.name} ({store.product_count || 0} products)
{dispensaries.map(disp => (
<option key={disp.id} value={disp.id}>
{disp.dba_name || disp.name} - {disp.city}, {disp.state}
</option>
))}
</select>
{selectedStoreData && (
{selectedDispensaryData && (
<div className="mt-4 p-4 bg-base-200 rounded-lg">
<div className="grid grid-cols-2 md:grid-cols-4 gap-4 text-sm">
<div>
<div className="text-gray-500">Status</div>
<div className="font-semibold">
{selectedStoreData.scrape_enabled ? (
{selectedDispensaryData.scrape_enabled ? (
<span className="badge badge-success">Enabled</span>
) : (
<span className="badge badge-error">Disabled</span>
@@ -166,18 +138,18 @@ export function ScraperTools() {
</div>
</div>
<div>
<div className="text-gray-500">Categories</div>
<div className="font-semibold">{selectedStoreData.category_count || 0}</div>
<div className="text-gray-500">Provider</div>
<div className="font-semibold">{selectedDispensaryData.provider_type || 'Unknown'}</div>
</div>
<div>
<div className="text-gray-500">Products</div>
<div className="font-semibold">{selectedStoreData.product_count || 0}</div>
<div className="font-semibold">{selectedDispensaryData.product_count || 0}</div>
</div>
<div>
<div className="text-gray-500">Last Scraped</div>
<div className="text-gray-500">Last Crawled</div>
<div className="font-semibold">
{selectedStoreData.last_scraped_at
? new Date(selectedStoreData.last_scraped_at).toLocaleDateString()
{selectedDispensaryData.last_crawl_at
? new Date(selectedDispensaryData.last_crawl_at).toLocaleDateString()
: 'Never'}
</div>
</div>
@@ -189,56 +161,21 @@ export function ScraperTools() {
{/* Scraper Actions */}
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
{/* Scrape Now */}
{/* Crawl Now */}
<div className="card bg-base-100 shadow-xl">
<div className="card-body">
<h2 className="card-title">Scrape Store</h2>
<h2 className="card-title">Crawl Dispensary</h2>
<p className="text-sm text-gray-500">
Start scraping products from the selected store
Start crawling products from the selected dispensary menu
</p>
<div className="form-control w-full mt-4">
<label className="label">
<span className="label-text">Parallel Scrapers</span>
</label>
<input
type="number"
min="1"
max="10"
value={parallelScrapers}
onChange={(e) => setParallelScrapers(parseInt(e.target.value) || 3)}
className="input input-bordered w-full"
/>
<label className="label">
<span className="label-text-alt">Number of concurrent scraping processes (1-10)</span>
</label>
</div>
<div className="form-control w-full mt-4">
<label className="label">
<span className="label-text">User Agent</span>
</label>
<select
className="select select-bordered w-full"
value={selectedUserAgent}
onChange={(e) => setSelectedUserAgent(e.target.value)}
>
{Object.entries(USER_AGENTS).map(([key, label]) => (
<option key={key} value={key}>{label}</option>
))}
</select>
<label className="label">
<span className="label-text-alt">Browser/bot identity for scraping session</span>
</label>
</div>
<div className="card-actions justify-end mt-4">
<button
onClick={handleScrape}
disabled={!selectedStore || scraping}
disabled={!selectedDispensary || scraping}
className={`btn btn-primary ${scraping ? 'loading' : ''}`}
>
{scraping ? 'Scraping...' : 'Start Scrape'}
{scraping ? 'Starting...' : 'Start Crawl'}
</button>
</div>
</div>
@@ -249,13 +186,13 @@ export function ScraperTools() {
<div className="card-body">
<h2 className="card-title">Download Images</h2>
<p className="text-sm text-gray-500">
Download missing product images for the selected store
Download missing product images for the selected dispensary
</p>
<div className="card-actions justify-end mt-auto">
<button
onClick={handleDownloadImages}
disabled={!selectedStore || downloadingImages}
disabled={!selectedDispensary || downloadingImages}
className={`btn btn-secondary ${downloadingImages ? 'loading' : ''}`}
>
{downloadingImages ? 'Downloading...' : 'Download Missing Images'}
@@ -263,46 +200,6 @@ export function ScraperTools() {
</div>
</div>
</div>
{/* Discover Categories */}
<div className="card bg-base-100 shadow-xl">
<div className="card-body">
<h2 className="card-title">Discover Categories</h2>
<p className="text-sm text-gray-500">
Automatically discover and create categories from the store
</p>
<div className="card-actions justify-end mt-auto">
<button
onClick={handleDiscoverCategories}
disabled={!selectedStore || discoveringCategories}
className={`btn btn-accent ${discoveringCategories ? 'loading' : ''}`}
>
{discoveringCategories ? 'Discovering...' : 'Discover Categories'}
</button>
</div>
</div>
</div>
{/* Debug Scraper */}
<div className="card bg-base-100 shadow-xl">
<div className="card-body">
<h2 className="card-title">Debug Scraper</h2>
<p className="text-sm text-gray-500">
Run scraper in debug mode and view detailed logs
</p>
<div className="card-actions justify-end mt-auto">
<button
onClick={handleDebug}
disabled={!selectedStore || debugging}
className={`btn btn-warning ${debugging ? 'loading' : ''}`}
>
{debugging ? 'Debugging...' : 'Start Debug'}
</button>
</div>
</div>
</div>
</div>
{/* Quick Links */}