Remove incorrect migration 029, add snapshot architecture, improve scraper
- Delete migration 029 that was incorrectly creating duplicate dispensaries - Add migration 028 for snapshot architecture - Improve downloader with proxy/UA rotation - Update scraper monitor and tools pages - Various scraper improvements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
240
backend/migrations/028_snapshot_architecture.sql
Normal file
240
backend/migrations/028_snapshot_architecture.sql
Normal file
@@ -0,0 +1,240 @@
|
||||
-- Migration 028: Snapshot Architecture
|
||||
-- Implements append-only snapshots for full history tracking
|
||||
-- Following the principle: "Never delete, only append observations"
|
||||
|
||||
-- =====================================================
|
||||
-- LAYER 1: Raw Append-Only Snapshots (NEVER DELETE)
|
||||
-- =====================================================
|
||||
|
||||
-- Product snapshots: one append-only row per product per crawl (Layer 1).
-- Rows are never updated or deleted; history is reconstructed via captured_at.
CREATE TABLE IF NOT EXISTS product_snapshots (
    id SERIAL PRIMARY KEY,

    -- Source identification
    crawl_id UUID NOT NULL,             -- groups all products from the same crawl run
                                        -- (FK to crawl_runs(id) omitted: crawl_runs is created later in this file)
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),

    -- Product identification
    external_product_id VARCHAR(255),   -- Dutchie/provider product ID
    product_slug VARCHAR(500),          -- URL slug used for cross-crawl matching

    -- Product details as observed at crawl time
    name VARCHAR(500) NOT NULL,
    brand VARCHAR(255),
    category VARCHAR(100),
    subcategory VARCHAR(100),

    -- Pricing snapshot
    price NUMERIC(10,2),
    original_price NUMERIC(10,2),
    sale_price NUMERIC(10,2),
    discount_type VARCHAR(50),
    discount_value VARCHAR(100),

    -- Availability snapshot. The named CHECK turns the previously comment-only
    -- value set into an enforced contract. NOTE: with IF NOT EXISTS the
    -- constraint only lands on fresh databases; existing deployments need an
    -- ALTER TABLE to pick it up.
    availability_status VARCHAR(30) NOT NULL DEFAULT 'unknown'
        CONSTRAINT product_snapshots_availability_status_check
        CHECK (availability_status IN ('in_stock', 'out_of_stock', 'limited', 'removed_from_menu', 'unknown')),
    stock_quantity INTEGER,

    -- Potency snapshot
    thc_percentage NUMERIC(5,2),
    cbd_percentage NUMERIC(5,2),

    -- Product attributes
    strain_type VARCHAR(100),
    weight VARCHAR(100),
    variant VARCHAR(255),

    -- Rich data
    description TEXT,
    image_url TEXT,
    effects TEXT[],
    terpenes TEXT[],

    -- When this observation was captured
    captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Raw provider payload, kept for forensics and backfills
    raw_data JSONB
);

-- Indexes for the common access paths: per-store history, per-crawl,
-- per-brand, per-product, per-status, per-category.
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_time ON product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl ON product_snapshots(crawl_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON product_snapshots(brand, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_product_slug ON product_snapshots(product_slug, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON product_snapshots(external_product_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_availability ON product_snapshots(availability_status, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_category ON product_snapshots(category, captured_at DESC);
|
||||
|
||||
-- Brand snapshots: one row per (crawl, dispensary, brand) summarising how a
-- brand looked at that store at crawl time (Layer 1, append-only).
CREATE TABLE IF NOT EXISTS brand_snapshots (
    id SERIAL PRIMARY KEY,
    crawl_id UUID NOT NULL,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),

    brand_name VARCHAR(255) NOT NULL,
    product_count INTEGER NOT NULL DEFAULT 0,
    in_stock_count INTEGER NOT NULL DEFAULT 0,

    -- Price range observed for this brand at this store at this time
    min_price NUMERIC(10,2),
    max_price NUMERIC(10,2),
    avg_price NUMERIC(10,2),

    -- Categories this brand had products in at capture time
    categories TEXT[],

    captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Lookup paths: per-store history, per-brand history, per-crawl grouping
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_dispensary ON brand_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_crawl ON brand_snapshots(crawl_id);
|
||||
|
||||
-- Crawl runs: one metadata row per crawl; product/brand snapshots reference
-- this run via their shared crawl_id.
CREATE TABLE IF NOT EXISTS crawl_runs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),

    started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    completed_at TIMESTAMPTZ,

    -- Lifecycle state. The named CHECK turns the previously comment-only
    -- value set into an enforced contract. NOTE: with IF NOT EXISTS the
    -- constraint only lands on fresh databases; existing deployments need
    -- an ALTER TABLE to pick it up.
    status VARCHAR(20) NOT NULL DEFAULT 'running'
        CONSTRAINT crawl_runs_status_check
        CHECK (status IN ('running', 'completed', 'failed')),

    -- Results
    products_found INTEGER DEFAULT 0,
    brands_found INTEGER DEFAULT 0,
    categories_found INTEGER DEFAULT 0,

    -- Errors if any
    error_message TEXT,

    -- Provider info
    provider VARCHAR(50),
    menu_url TEXT
);

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
|
||||
|
||||
-- =====================================================
|
||||
-- LAYER 2: Summary/Rollup Tables (can be recalculated)
|
||||
-- =====================================================
|
||||
|
||||
-- Daily rollup of brand presence per store (Layer 2: fully derivable from the
-- Layer-1 snapshots, so safe to truncate and rebuild).
CREATE TABLE IF NOT EXISTS brand_store_day_summary (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
    brand_name VARCHAR(255) NOT NULL,
    summary_date DATE NOT NULL,

    -- Presence tracking
    first_seen_at TIMESTAMPTZ,
    last_seen_at TIMESTAMPTZ,
    crawl_count INTEGER DEFAULT 0,      -- how many crawls observed this brand today

    -- Product counts
    total_skus INTEGER DEFAULT 0,
    in_stock_skus INTEGER DEFAULT 0,
    out_of_stock_events INTEGER DEFAULT 0,

    -- Price stats
    min_price NUMERIC(10,2),
    max_price NUMERIC(10,2),
    avg_price NUMERIC(10,2),

    -- Categories seen for this brand today
    categories TEXT[],

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),

    -- One rollup row per brand per store per day (natural upsert target)
    UNIQUE(dispensary_id, brand_name, summary_date)
);

CREATE INDEX IF NOT EXISTS idx_brand_store_day_dispensary ON brand_store_day_summary(dispensary_id, summary_date DESC);
CREATE INDEX IF NOT EXISTS idx_brand_store_day_brand ON brand_store_day_summary(brand_name, summary_date DESC);
|
||||
|
||||
-- Daily rollup per SKU per store (Layer 2: fully derivable from the Layer-1
-- snapshots, so safe to truncate and rebuild).
CREATE TABLE IF NOT EXISTS product_sku_day_summary (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
    product_slug VARCHAR(500) NOT NULL,
    summary_date DATE NOT NULL,

    -- Latest observed values for the day
    name VARCHAR(500),
    brand VARCHAR(255),
    category VARCHAR(100),

    -- Price tracking
    opening_price NUMERIC(10,2),        -- first price seen during the day
    closing_price NUMERIC(10,2),        -- last price seen during the day
    min_price NUMERIC(10,2),
    max_price NUMERIC(10,2),
    price_changes INTEGER DEFAULT 0,

    -- Availability
    times_in_stock INTEGER DEFAULT 0,
    times_out_of_stock INTEGER DEFAULT 0,
    first_seen_at TIMESTAMPTZ,
    last_seen_at TIMESTAMPTZ,

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),

    -- One rollup row per SKU per store per day (natural upsert target)
    UNIQUE(dispensary_id, product_slug, summary_date)
);

CREATE INDEX IF NOT EXISTS idx_sku_day_dispensary ON product_sku_day_summary(dispensary_id, summary_date DESC);
CREATE INDEX IF NOT EXISTS idx_sku_day_slug ON product_sku_day_summary(product_slug, summary_date DESC);
|
||||
|
||||
-- =====================================================
|
||||
-- VIEWS for common queries
|
||||
-- =====================================================
|
||||
|
||||
-- Latest snapshot per (store, product), joined with store identity.
-- NOTE(review): unlike current_brands below, there is no recency cutoff here,
-- so a product that left the menu long ago still appears with its last-ever
-- snapshot — confirm this is intended before using it for "currently on menu".
CREATE OR REPLACE VIEW current_products AS
SELECT DISTINCT ON (ps.dispensary_id, ps.product_slug)
    ps.*,
    d.name AS dispensary_name,
    COALESCE(d.dba_name, d.name) AS store_name
FROM product_snapshots ps
JOIN dispensaries d ON d.id = ps.dispensary_id
ORDER BY ps.dispensary_id, ps.product_slug, ps.captured_at DESC;
|
||||
|
||||
-- Latest brand snapshot per (store, brand), limited to brands observed within
-- the last 7 days, joined with store identity.
CREATE OR REPLACE VIEW current_brands AS
SELECT DISTINCT ON (bs.dispensary_id, bs.brand_name)
    bs.*,
    d.name AS dispensary_name,
    COALESCE(d.dba_name, d.name) AS store_name
FROM brand_snapshots bs
JOIN dispensaries d ON d.id = bs.dispensary_id
WHERE bs.captured_at >= NOW() - INTERVAL '7 days'
ORDER BY bs.dispensary_id, bs.brand_name, bs.captured_at DESC;
|
||||
|
||||
-- Market-wide brand coverage across stores over the last 7 days.
-- Fix: aggregate only the LATEST snapshot per (store, brand). Previously every
-- snapshot in the window was summed, so a store crawled N times contributed
-- its product_count N times to total_skus and skewed the price statistics
-- toward frequently-crawled stores.
CREATE OR REPLACE VIEW brand_store_coverage AS
SELECT
    brand_name,
    COUNT(DISTINCT dispensary_id) AS store_count,
    SUM(product_count) AS total_skus,
    MIN(min_price) AS market_min_price,
    MAX(max_price) AS market_max_price,
    AVG(avg_price) AS market_avg_price,
    MAX(captured_at) AS last_seen_at
FROM (
    -- most recent snapshot per (store, brand) within the window
    SELECT DISTINCT ON (dispensary_id, brand_name)
        dispensary_id, brand_name, product_count,
        min_price, max_price, avg_price, captured_at
    FROM brand_snapshots
    WHERE captured_at >= NOW() - INTERVAL '7 days'
    ORDER BY dispensary_id, brand_name, captured_at DESC
) latest
GROUP BY brand_name;
|
||||
|
||||
-- Grant permissions to the scraper service role.
-- Raw snapshot tables are append-only by design: SELECT + INSERT only, no
-- UPDATE/DELETE, enforcing "never delete, only append" at the privilege level.
GRANT SELECT, INSERT ON product_snapshots TO scraper;
GRANT SELECT, INSERT ON brand_snapshots TO scraper;

-- Fix: SERIAL primary keys draw from implicit sequences, and in PostgreSQL an
-- INSERT relying on the DEFAULT needs USAGE on the sequence — without these,
-- the scraper role's inserts fail with a permission error.
GRANT USAGE, SELECT ON SEQUENCE product_snapshots_id_seq TO scraper;
GRANT USAGE, SELECT ON SEQUENCE brand_snapshots_id_seq TO scraper;
GRANT USAGE, SELECT ON SEQUENCE brand_store_day_summary_id_seq TO scraper;
GRANT USAGE, SELECT ON SEQUENCE product_sku_day_summary_id_seq TO scraper;

-- crawl_runs (status transitions) and the Layer-2 rollups (daily upserts)
-- are mutable, so UPDATE is granted as well. crawl_runs uses a UUID default,
-- so no sequence grant is needed for it.
GRANT SELECT, INSERT, UPDATE ON crawl_runs TO scraper;
GRANT SELECT, INSERT, UPDATE ON brand_store_day_summary TO scraper;
GRANT SELECT, INSERT, UPDATE ON product_sku_day_summary TO scraper;

-- Read-only access to the convenience views
GRANT SELECT ON current_products TO scraper;
GRANT SELECT ON current_brands TO scraper;
GRANT SELECT ON brand_store_coverage TO scraper;
|
||||
@@ -1,56 +0,0 @@
|
||||
-- =====================================================
-- Link Dutchie Stores to Dispensaries
-- =====================================================
-- Creates dispensary records for stores with dutchie_url that
-- don't yet have a dispensary_id, then links them.
-- NOTE(review): this migration was removed by the enclosing commit for
-- "incorrectly creating duplicate dispensaries" — slug-only conflict detection
-- cannot match an existing dispensary whose slug differs from the one derived
-- below, so re-runs and slug mismatches mint duplicate placeholder rows.

DO $$
DECLARE
    rec RECORD;
    derived_slug TEXT;
    linked_disp_id INTEGER;
BEGIN
    FOR rec IN
        SELECT id, name, dutchie_url
        FROM stores
        WHERE dutchie_url IS NOT NULL AND dispensary_id IS NULL
    LOOP
        -- Derive the menu slug: strip the known dutchie.com URL prefixes,
        -- then drop everything after the first remaining path segment.
        derived_slug := regexp_replace(
            regexp_replace(rec.dutchie_url, '^https://dutchie\.com/(embedded-menu|dispensary)/', ''),
            '/.*$', ''
        );

        -- Insert a placeholder dispensary, or refresh the provider fields if
        -- one already exists under this slug.
        INSERT INTO dispensaries (name, slug, address, city, state, provider_type, menu_url, created_at, updated_at)
        VALUES (
            rec.name,
            derived_slug,
            'TBD', -- Address to be filled in later
            'TBD', -- City to be filled in later
            'AZ',  -- Default state
            'dutchie',
            rec.dutchie_url,
            NOW(),
            NOW()
        )
        ON CONFLICT (slug) DO UPDATE SET
            provider_type = 'dutchie',
            menu_url = EXCLUDED.menu_url,
            updated_at = NOW()
        RETURNING id INTO linked_disp_id;

        -- Point the store at the dispensary row just created or updated.
        UPDATE stores SET dispensary_id = linked_disp_id WHERE id = rec.id;

        RAISE NOTICE 'Linked store % (%) to dispensary %', rec.id, rec.name, linked_disp_id;
    END LOOP;
END $$;

-- Report on linked stores
SELECT s.id AS store_id, s.name AS store_name, s.dispensary_id, d.slug AS disp_slug
FROM stores s
JOIN dispensaries d ON d.id = s.dispensary_id
WHERE s.dutchie_url IS NOT NULL
ORDER BY s.id;
|
||||
@@ -71,36 +71,46 @@ router.get('/active/:id', async (req, res) => {
|
||||
// Get scraper history (last 50 completed scrapes)
|
||||
router.get('/history', async (req, res) => {
|
||||
try {
|
||||
const { limit = 50, store_id } = req.query;
|
||||
const { limit = 50, dispensary_id } = req.query;
|
||||
|
||||
let query = `
|
||||
SELECT
|
||||
s.id as store_id,
|
||||
s.name as store_name,
|
||||
c.id as category_id,
|
||||
c.name as category_name,
|
||||
c.last_scraped_at,
|
||||
d.id as dispensary_id,
|
||||
COALESCE(d.dba_name, d.name) as dispensary_name,
|
||||
d.city,
|
||||
d.state,
|
||||
dcj.id as job_id,
|
||||
dcj.job_type,
|
||||
dcj.status,
|
||||
dcj.products_found,
|
||||
dcj.products_new,
|
||||
dcj.products_updated,
|
||||
dcj.in_stock_count,
|
||||
dcj.out_of_stock_count,
|
||||
dcj.duration_ms,
|
||||
dcj.completed_at as last_scraped_at,
|
||||
dcj.error_message,
|
||||
(
|
||||
SELECT COUNT(*)
|
||||
FROM products p
|
||||
WHERE p.store_id = s.id
|
||||
AND p.category_id = c.id
|
||||
WHERE p.dispensary_id = d.id
|
||||
AND p.last_seen_at >= NOW() - INTERVAL '7 days'
|
||||
) as product_count
|
||||
FROM stores s
|
||||
LEFT JOIN categories c ON c.store_id = s.id
|
||||
WHERE c.last_scraped_at IS NOT NULL
|
||||
FROM dispensary_crawl_jobs dcj
|
||||
JOIN dispensaries d ON d.id = dcj.dispensary_id
|
||||
WHERE dcj.completed_at IS NOT NULL
|
||||
`;
|
||||
|
||||
const params: any[] = [];
|
||||
let paramCount = 1;
|
||||
|
||||
if (store_id) {
|
||||
query += ` AND s.id = $${paramCount}`;
|
||||
params.push(store_id);
|
||||
if (dispensary_id) {
|
||||
query += ` AND d.id = $${paramCount}`;
|
||||
params.push(dispensary_id);
|
||||
paramCount++;
|
||||
}
|
||||
|
||||
query += ` ORDER BY c.last_scraped_at DESC LIMIT $${paramCount}`;
|
||||
query += ` ORDER BY dcj.completed_at DESC LIMIT $${paramCount}`;
|
||||
params.push(limit);
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
@@ -169,7 +179,7 @@ export function completeScraper(id: string, error?: string): void {
|
||||
}
|
||||
}
|
||||
|
||||
// Brand scrape jobs endpoints
|
||||
// Dispensary crawl jobs endpoints
|
||||
router.get('/jobs/stats', async (req, res) => {
|
||||
try {
|
||||
const { dispensary_id } = req.query;
|
||||
@@ -187,8 +197,8 @@ router.get('/jobs/stats', async (req, res) => {
|
||||
status,
|
||||
COUNT(*) as count,
|
||||
SUM(products_found) as total_products_found,
|
||||
SUM(products_saved) as total_products_saved
|
||||
FROM brand_scrape_jobs
|
||||
SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved
|
||||
FROM dispensary_crawl_jobs
|
||||
${whereClause}
|
||||
GROUP BY status
|
||||
`, params);
|
||||
@@ -205,8 +215,8 @@ router.get('/jobs/stats', async (req, res) => {
|
||||
result.rows.forEach((row: { status: string; count: string; total_products_found?: string; total_products_saved?: string }) => {
|
||||
stats[row.status as keyof typeof stats] = parseInt(row.count);
|
||||
if (row.status === 'completed') {
|
||||
stats.total_products_found = parseInt(row.total_products_found || '0');
|
||||
stats.total_products_saved = parseInt(row.total_products_saved || '0');
|
||||
stats.total_products_found += parseInt(row.total_products_found || '0');
|
||||
stats.total_products_saved += parseInt(row.total_products_saved || '0');
|
||||
}
|
||||
});
|
||||
|
||||
@@ -221,31 +231,32 @@ router.get('/jobs/active', async (req, res) => {
|
||||
try {
|
||||
const { dispensary_id } = req.query;
|
||||
|
||||
let whereClause = "WHERE status = 'in_progress'";
|
||||
let whereClause = "WHERE dcj.status = 'in_progress'";
|
||||
const params: any[] = [];
|
||||
let paramCount = 1;
|
||||
|
||||
if (dispensary_id) {
|
||||
whereClause += ` AND dispensary_id = $${paramCount}`;
|
||||
whereClause += ` AND dcj.dispensary_id = $${paramCount}`;
|
||||
params.push(dispensary_id);
|
||||
paramCount++;
|
||||
}
|
||||
|
||||
const result = await pool.query(`
|
||||
SELECT
|
||||
id,
|
||||
dispensary_id,
|
||||
brand_slug,
|
||||
brand_name,
|
||||
status,
|
||||
worker_id,
|
||||
started_at,
|
||||
products_found,
|
||||
products_saved,
|
||||
EXTRACT(EPOCH FROM (NOW() - started_at)) as duration_seconds
|
||||
FROM brand_scrape_jobs
|
||||
dcj.id,
|
||||
dcj.dispensary_id,
|
||||
COALESCE(d.dba_name, d.name) as dispensary_name,
|
||||
dcj.job_type,
|
||||
dcj.status,
|
||||
dcj.worker_id,
|
||||
dcj.started_at,
|
||||
dcj.products_found,
|
||||
COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
|
||||
EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds
|
||||
FROM dispensary_crawl_jobs dcj
|
||||
JOIN dispensaries d ON d.id = dcj.dispensary_id
|
||||
${whereClause}
|
||||
ORDER BY started_at DESC
|
||||
ORDER BY dcj.started_at DESC
|
||||
`, params);
|
||||
|
||||
res.json({ jobs: result.rows });
|
||||
@@ -266,13 +277,13 @@ router.get('/jobs/recent', async (req, res) => {
|
||||
const conditions: string[] = [];
|
||||
|
||||
if (dispensary_id) {
|
||||
conditions.push(`dispensary_id = $${paramCount}`);
|
||||
conditions.push(`dcj.dispensary_id = $${paramCount}`);
|
||||
params.push(dispensary_id);
|
||||
paramCount++;
|
||||
}
|
||||
|
||||
if (status) {
|
||||
conditions.push(`status = $${paramCount}`);
|
||||
conditions.push(`dcj.status = $${paramCount}`);
|
||||
params.push(status);
|
||||
paramCount++;
|
||||
}
|
||||
@@ -285,22 +296,22 @@ router.get('/jobs/recent', async (req, res) => {
|
||||
|
||||
const result = await pool.query(`
|
||||
SELECT
|
||||
id,
|
||||
dispensary_id,
|
||||
brand_slug,
|
||||
brand_name,
|
||||
status,
|
||||
worker_id,
|
||||
started_at,
|
||||
completed_at,
|
||||
products_found,
|
||||
products_saved,
|
||||
error_message,
|
||||
retry_count,
|
||||
EXTRACT(EPOCH FROM (COALESCE(completed_at, NOW()) - started_at)) as duration_seconds
|
||||
FROM brand_scrape_jobs
|
||||
dcj.id,
|
||||
dcj.dispensary_id,
|
||||
COALESCE(d.dba_name, d.name) as dispensary_name,
|
||||
dcj.job_type,
|
||||
dcj.status,
|
||||
dcj.worker_id,
|
||||
dcj.started_at,
|
||||
dcj.completed_at,
|
||||
dcj.products_found,
|
||||
COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
|
||||
dcj.error_message,
|
||||
EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds
|
||||
FROM dispensary_crawl_jobs dcj
|
||||
JOIN dispensaries d ON d.id = dcj.dispensary_id
|
||||
${whereClause}
|
||||
ORDER BY created_at DESC
|
||||
ORDER BY dcj.created_at DESC
|
||||
LIMIT $${paramCount}
|
||||
`, params);
|
||||
|
||||
@@ -328,10 +339,10 @@ router.get('/jobs/workers', async (req, res) => {
|
||||
worker_id,
|
||||
COUNT(*) as active_jobs,
|
||||
SUM(products_found) as total_products_found,
|
||||
SUM(products_saved) as total_products_saved,
|
||||
SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved,
|
||||
MIN(started_at) as earliest_start,
|
||||
MAX(started_at) as latest_start
|
||||
FROM brand_scrape_jobs
|
||||
FROM dispensary_crawl_jobs
|
||||
${whereClause}
|
||||
GROUP BY worker_id
|
||||
ORDER BY worker_id
|
||||
|
||||
@@ -3,16 +3,108 @@ import axios from 'axios';
|
||||
import { ScraperRequest, ScraperResponse, ScraperError, ErrorType, ProxyConfig } from './types';
|
||||
import { logger } from '../services/logger';
|
||||
|
||||
// Fingerprint profiles for randomization
|
||||
const SCREEN_RESOLUTIONS = [
|
||||
{ width: 1920, height: 1080 },
|
||||
{ width: 1366, height: 768 },
|
||||
{ width: 1536, height: 864 },
|
||||
{ width: 1440, height: 900 },
|
||||
{ width: 1280, height: 720 },
|
||||
{ width: 2560, height: 1440 },
|
||||
{ width: 1680, height: 1050 },
|
||||
{ width: 1600, height: 900 },
|
||||
];
|
||||
|
||||
const TIMEZONES = [
|
||||
'America/New_York',
|
||||
'America/Chicago',
|
||||
'America/Denver',
|
||||
'America/Los_Angeles',
|
||||
'America/Phoenix',
|
||||
];
|
||||
|
||||
const LANGUAGES = [
|
||||
['en-US', 'en'],
|
||||
['en-US', 'en', 'es'],
|
||||
['en-US'],
|
||||
];
|
||||
|
||||
const PLATFORMS = [
|
||||
'Win32',
|
||||
'MacIntel',
|
||||
'Linux x86_64',
|
||||
];
|
||||
|
||||
const WEBGL_VENDORS = [
|
||||
'Google Inc. (NVIDIA)',
|
||||
'Google Inc. (Intel)',
|
||||
'Google Inc. (AMD)',
|
||||
'Intel Inc.',
|
||||
'NVIDIA Corporation',
|
||||
];
|
||||
|
||||
const WEBGL_RENDERERS = [
|
||||
'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)',
|
||||
'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)',
|
||||
'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)',
|
||||
'Intel Iris OpenGL Engine',
|
||||
'NVIDIA GeForce RTX 3070/PCIe/SSE2',
|
||||
'AMD Radeon Pro 5500M OpenGL Engine',
|
||||
];
|
||||
|
||||
interface Fingerprint {
|
||||
screen: { width: number; height: number };
|
||||
timezone: string;
|
||||
languages: string[];
|
||||
platform: string;
|
||||
hardwareConcurrency: number;
|
||||
deviceMemory: number;
|
||||
webglVendor: string;
|
||||
webglRenderer: string;
|
||||
}
|
||||
|
||||
function generateRandomFingerprint(): Fingerprint {
|
||||
return {
|
||||
screen: SCREEN_RESOLUTIONS[Math.floor(Math.random() * SCREEN_RESOLUTIONS.length)],
|
||||
timezone: TIMEZONES[Math.floor(Math.random() * TIMEZONES.length)],
|
||||
languages: LANGUAGES[Math.floor(Math.random() * LANGUAGES.length)],
|
||||
platform: PLATFORMS[Math.floor(Math.random() * PLATFORMS.length)],
|
||||
hardwareConcurrency: [4, 8, 12, 16][Math.floor(Math.random() * 4)],
|
||||
deviceMemory: [4, 8, 16, 32][Math.floor(Math.random() * 4)],
|
||||
webglVendor: WEBGL_VENDORS[Math.floor(Math.random() * WEBGL_VENDORS.length)],
|
||||
webglRenderer: WEBGL_RENDERERS[Math.floor(Math.random() * WEBGL_RENDERERS.length)],
|
||||
};
|
||||
}
|
||||
|
||||
export class Downloader {
|
||||
private browser: Browser | null = null;
|
||||
private page: Page | null = null;
|
||||
private pageInUse: boolean = false;
|
||||
private currentFingerprint: Fingerprint = generateRandomFingerprint();
|
||||
private needsNewFingerprint: boolean = false;
|
||||
|
||||
/**
|
||||
* Initialize browser instance (lazy initialization)
|
||||
* Force new fingerprint on next browser creation
|
||||
*/
|
||||
private async getBrowser(): Promise<Browser> {
|
||||
rotateFingerprint(): void {
|
||||
this.needsNewFingerprint = true;
|
||||
logger.info('scraper', '🔄 Fingerprint rotation scheduled');
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize browser instance with fingerprint
|
||||
*/
|
||||
private async getBrowser(forceNew: boolean = false): Promise<Browser> {
|
||||
// Create new browser if needed for fingerprint rotation
|
||||
if (forceNew || this.needsNewFingerprint) {
|
||||
await this.close();
|
||||
this.currentFingerprint = generateRandomFingerprint();
|
||||
this.needsNewFingerprint = false;
|
||||
logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`);
|
||||
}
|
||||
|
||||
if (!this.browser || !this.browser.isConnected()) {
|
||||
const { screen } = this.currentFingerprint;
|
||||
const launchOptions: any = {
|
||||
headless: 'new',
|
||||
args: [
|
||||
@@ -20,9 +112,11 @@ export class Downloader {
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--window-size=1920,1080',
|
||||
`--window-size=${screen.width},${screen.height}`,
|
||||
'--disable-web-security',
|
||||
'--disable-features=IsolateOrigins,site-per-process'
|
||||
'--disable-features=IsolateOrigins,site-per-process',
|
||||
'--disable-infobars',
|
||||
'--disable-extensions',
|
||||
]
|
||||
};
|
||||
|
||||
@@ -34,52 +128,157 @@ export class Downloader {
|
||||
}
|
||||
|
||||
/**
|
||||
* Get or create a page instance
|
||||
* Get or create a page instance with current fingerprint
|
||||
*/
|
||||
private async getPage(): Promise<Page> {
|
||||
if (!this.page || this.page.isClosed()) {
|
||||
const browser = await this.getBrowser();
|
||||
private async getPage(forceNew: boolean = false): Promise<Page> {
|
||||
if (!this.page || this.page.isClosed() || forceNew) {
|
||||
const browser = await this.getBrowser(forceNew);
|
||||
this.page = await browser.newPage();
|
||||
await this.page.setViewport({ width: 1920, height: 1080 });
|
||||
logger.debug('scraper', 'New page created');
|
||||
|
||||
const { screen } = this.currentFingerprint;
|
||||
await this.page.setViewport({
|
||||
width: screen.width,
|
||||
height: screen.height,
|
||||
deviceScaleFactor: 1,
|
||||
});
|
||||
|
||||
// Apply fingerprint
|
||||
await this.applyFingerprint(this.page);
|
||||
logger.debug('scraper', 'New page created with fingerprint');
|
||||
}
|
||||
|
||||
return this.page;
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply stealth mode to page
|
||||
* Apply full fingerprint to page
|
||||
*/
|
||||
private async makePageStealthy(page: Page): Promise<void> {
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// @ts-ignore - runs in browser context
|
||||
private async applyFingerprint(page: Page): Promise<void> {
|
||||
const fp = this.currentFingerprint;
|
||||
|
||||
await page.evaluateOnNewDocument((fingerprint) => {
|
||||
// Hide webdriver
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => false,
|
||||
});
|
||||
|
||||
// @ts-ignore - runs in browser context
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
// Spoof platform
|
||||
Object.defineProperty(navigator, 'platform', {
|
||||
get: () => fingerprint.platform,
|
||||
});
|
||||
|
||||
// @ts-ignore - runs in browser context
|
||||
// Spoof languages
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['en-US', 'en'],
|
||||
get: () => fingerprint.languages,
|
||||
});
|
||||
|
||||
// @ts-ignore - runs in browser context
|
||||
// Spoof hardware concurrency
|
||||
Object.defineProperty(navigator, 'hardwareConcurrency', {
|
||||
get: () => fingerprint.hardwareConcurrency,
|
||||
});
|
||||
|
||||
// Spoof device memory
|
||||
Object.defineProperty(navigator, 'deviceMemory', {
|
||||
get: () => fingerprint.deviceMemory,
|
||||
});
|
||||
|
||||
// Spoof plugins (realistic count)
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => {
|
||||
const plugins: any = [];
|
||||
for (let i = 0; i < 5; i++) {
|
||||
plugins.push({
|
||||
name: `Plugin ${i}`,
|
||||
filename: `plugin${i}.dll`,
|
||||
description: `Description ${i}`,
|
||||
});
|
||||
}
|
||||
plugins.length = 5;
|
||||
return plugins;
|
||||
},
|
||||
});
|
||||
|
||||
// Chrome object
|
||||
(window as any).chrome = {
|
||||
runtime: {},
|
||||
loadTimes: () => ({}),
|
||||
csi: () => ({}),
|
||||
app: {},
|
||||
};
|
||||
|
||||
// @ts-ignore - runs in browser context
|
||||
// Permissions
|
||||
const originalQuery = window.navigator.permissions.query;
|
||||
// @ts-ignore - runs in browser context
|
||||
window.navigator.permissions.query = (parameters: any) =>
|
||||
parameters.name === 'notifications'
|
||||
? Promise.resolve({ state: 'denied' } as any)
|
||||
: originalQuery(parameters);
|
||||
});
|
||||
|
||||
// WebGL fingerprint spoofing
|
||||
const getParameterProxyHandler = {
|
||||
apply: function(target: any, thisArg: any, argumentsList: any) {
|
||||
const param = argumentsList[0];
|
||||
// UNMASKED_VENDOR_WEBGL
|
||||
if (param === 37445) {
|
||||
return fingerprint.webglVendor;
|
||||
}
|
||||
// UNMASKED_RENDERER_WEBGL
|
||||
if (param === 37446) {
|
||||
return fingerprint.webglRenderer;
|
||||
}
|
||||
return Reflect.apply(target, thisArg, argumentsList);
|
||||
}
|
||||
};
|
||||
|
||||
// Override WebGL
|
||||
const originalGetContext = HTMLCanvasElement.prototype.getContext;
|
||||
(HTMLCanvasElement.prototype as any).getContext = function(this: HTMLCanvasElement, type: string, ...args: any[]) {
|
||||
const context = originalGetContext.call(this, type, ...args);
|
||||
if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) {
|
||||
const glContext = context as WebGLRenderingContext;
|
||||
const originalGetParameter = glContext.getParameter.bind(glContext);
|
||||
(glContext as any).getParameter = new Proxy(originalGetParameter, getParameterProxyHandler);
|
||||
}
|
||||
return context;
|
||||
};
|
||||
|
||||
// Canvas fingerprint noise
|
||||
const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
|
||||
HTMLCanvasElement.prototype.toDataURL = function(type?: string) {
|
||||
const context = this.getContext('2d');
|
||||
if (context) {
|
||||
const imageData = context.getImageData(0, 0, this.width, this.height);
|
||||
for (let i = 0; i < imageData.data.length; i += 4) {
|
||||
// Add tiny noise to RGB values
|
||||
imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0);
|
||||
}
|
||||
context.putImageData(imageData, 0, 0);
|
||||
}
|
||||
return originalToDataURL.call(this, type);
|
||||
};
|
||||
|
||||
// Screen dimensions
|
||||
Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width });
|
||||
Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height });
|
||||
Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width });
|
||||
Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 });
|
||||
Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width });
|
||||
Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 });
|
||||
Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width });
|
||||
Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height });
|
||||
|
||||
}, fp);
|
||||
|
||||
// Set timezone via CDP
|
||||
const client = await page.target().createCDPSession();
|
||||
await client.send('Emulation.setTimezoneOverride', { timezoneId: fp.timezone });
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply stealth mode to page (legacy - now uses applyFingerprint)
|
||||
*/
|
||||
private async makePageStealthy(page: Page): Promise<void> {
|
||||
// Now handled by applyFingerprint
|
||||
await this.applyFingerprint(page);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,13 +1,32 @@
|
||||
import { Middleware, ScraperRequest, ScraperResponse, ScraperError, ErrorType, ProxyConfig } from './types';
|
||||
import { logger } from '../services/logger';
|
||||
import { pool } from '../db/migrate';
|
||||
import { getActiveProxy, putProxyInTimeout, isBotDetectionError } from '../services/proxy';
|
||||
|
||||
// Diverse, realistic user agents - updated for 2024/2025
|
||||
const USER_AGENTS = [
|
||||
// Chrome on Windows (most common)
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
|
||||
// Chrome on Mac
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
// Chrome on Linux
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
// Firefox
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
|
||||
// Safari
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
|
||||
// Edge
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
|
||||
];
|
||||
|
||||
function getRandomUserAgent(): string {
|
||||
@@ -19,59 +38,90 @@ function sleep(ms: number): Promise<void> {
|
||||
}
|
||||
|
||||
/**
|
||||
* User Agent Rotation Middleware
|
||||
* User Agent Rotation Middleware - rotates UA on each request for better evasion
|
||||
*/
|
||||
export class UserAgentMiddleware implements Middleware {
|
||||
name = 'UserAgentMiddleware';
|
||||
priority = 100;
|
||||
|
||||
private lastUserAgent: string | null = null;
|
||||
|
||||
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
|
||||
if (!request.metadata.userAgent) {
|
||||
request.metadata.userAgent = getRandomUserAgent();
|
||||
// Always rotate UA on retries or bot detection
|
||||
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
|
||||
|
||||
if (!request.metadata.userAgent || forceRotation) {
|
||||
// Get a different UA than the last one used
|
||||
let newUA = getRandomUserAgent();
|
||||
let attempts = 0;
|
||||
while (newUA === this.lastUserAgent && attempts < 5) {
|
||||
newUA = getRandomUserAgent();
|
||||
attempts++;
|
||||
}
|
||||
request.metadata.userAgent = newUA;
|
||||
this.lastUserAgent = newUA;
|
||||
|
||||
if (forceRotation) {
|
||||
logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
|
||||
}
|
||||
}
|
||||
return request;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Proxy Rotation Middleware
|
||||
* Proxy Rotation Middleware - uses the central proxy service with timeout handling
|
||||
*/
|
||||
export class ProxyMiddleware implements Middleware {
|
||||
name = 'ProxyMiddleware';
|
||||
priority = 90;
|
||||
|
||||
private async getActiveProxy(): Promise<ProxyConfig | null> {
|
||||
try {
|
||||
const result = await pool.query(`
|
||||
SELECT host, port, protocol, username, password
|
||||
FROM proxies
|
||||
WHERE active = true AND is_anonymous = true
|
||||
ORDER BY RANDOM()
|
||||
LIMIT 1
|
||||
`);
|
||||
|
||||
if (result.rows.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return result.rows[0];
|
||||
} catch (error) {
|
||||
logger.error('scraper', `Failed to get proxy: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
private currentProxyId: number | null = null;
|
||||
|
||||
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
|
||||
// Only add proxy if not already set
|
||||
if (!request.metadata.proxy && request.retryCount > 0) {
|
||||
// Use proxy on retries
|
||||
request.metadata.proxy = await this.getActiveProxy();
|
||||
if (request.metadata.proxy) {
|
||||
logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`);
|
||||
// Always try to use a proxy from the central proxy service
|
||||
// The service handles bot detection timeouts automatically
|
||||
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
|
||||
|
||||
if (!request.metadata.proxy || forceRotation) {
|
||||
// Get proxy from central service - it handles timeouts automatically
|
||||
const proxy = await getActiveProxy();
|
||||
if (proxy) {
|
||||
request.metadata.proxy = {
|
||||
host: proxy.host,
|
||||
port: proxy.port,
|
||||
protocol: proxy.protocol,
|
||||
username: proxy.username,
|
||||
password: proxy.password,
|
||||
};
|
||||
request.metadata.proxyId = proxy.id;
|
||||
this.currentProxyId = proxy.id;
|
||||
const reason = forceRotation ? 'rotation' : 'initial';
|
||||
logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
|
||||
} else {
|
||||
logger.warn('scraper', '⚠️ No proxy available - running without proxy');
|
||||
}
|
||||
}
|
||||
return request;
|
||||
}
|
||||
|
||||
async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
|
||||
// If bot detection was triggered, put the proxy in timeout
|
||||
if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
|
||||
putProxyInTimeout(response.request.metadata.proxyId, 'Bot detection triggered');
|
||||
logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
|
||||
}
|
||||
return response;
|
||||
}
|
||||
|
||||
async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
|
||||
// If bot detection error, put proxy in timeout
|
||||
if (isBotDetectionError(error.message) && request.metadata.proxyId) {
|
||||
putProxyInTimeout(request.metadata.proxyId, error.message);
|
||||
logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
|
||||
}
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -194,7 +244,7 @@ export class RetryMiddleware implements Middleware {
|
||||
}
|
||||
|
||||
/**
|
||||
* Bot Detection Middleware
|
||||
* Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation
|
||||
*/
|
||||
export class BotDetectionMiddleware implements Middleware {
|
||||
name = 'BotDetectionMiddleware';
|
||||
@@ -203,6 +253,9 @@ export class BotDetectionMiddleware implements Middleware {
|
||||
private detectedCount: number = 0;
|
||||
private readonly DETECTION_THRESHOLD = 3;
|
||||
|
||||
// Export for use by other middlewares
|
||||
static shouldRotateFingerprint: boolean = false;
|
||||
|
||||
async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
|
||||
const content = typeof response.content === 'string'
|
||||
? response.content
|
||||
@@ -215,17 +268,29 @@ export class BotDetectionMiddleware implements Middleware {
|
||||
/access denied/i,
|
||||
/you have been blocked/i,
|
||||
/unusual traffic/i,
|
||||
/robot/i
|
||||
/robot/i,
|
||||
/verify.*human/i,
|
||||
/security check/i,
|
||||
/please wait/i,
|
||||
/checking your browser/i,
|
||||
/ray id/i
|
||||
];
|
||||
|
||||
const detected = botIndicators.some(pattern => pattern.test(content));
|
||||
|
||||
if (detected) {
|
||||
this.detectedCount++;
|
||||
BotDetectionMiddleware.shouldRotateFingerprint = true;
|
||||
|
||||
logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
|
||||
logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');
|
||||
|
||||
// Mark the request for rotation on retry
|
||||
response.request.metadata.botDetected = true;
|
||||
response.request.metadata.needsNewBrowser = true;
|
||||
|
||||
if (this.detectedCount >= this.DETECTION_THRESHOLD) {
|
||||
const error: ScraperError = new Error('Bot detection threshold reached') as ScraperError;
|
||||
const error: ScraperError = new Error('Bot detection threshold reached - rotating fingerprint') as ScraperError;
|
||||
error.type = ErrorType.BOT_DETECTION;
|
||||
error.retryable = true;
|
||||
error.request = response.request;
|
||||
@@ -234,10 +299,25 @@ export class BotDetectionMiddleware implements Middleware {
|
||||
} else {
|
||||
// Gradually decrease detection count on successful requests
|
||||
this.detectedCount = Math.max(0, this.detectedCount - 0.5);
|
||||
BotDetectionMiddleware.shouldRotateFingerprint = false;
|
||||
}
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
|
||||
// If bot detection error, flag for rotation and allow retry
|
||||
if ('type' in error && (error as ScraperError).type === ErrorType.BOT_DETECTION) {
|
||||
request.metadata.botDetected = true;
|
||||
request.metadata.needsNewBrowser = true;
|
||||
logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
|
||||
|
||||
// Add delay before retry to avoid rate limiting
|
||||
await sleep(5000 + Math.random() * 5000);
|
||||
return null; // Return null to trigger retry
|
||||
}
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -154,6 +154,17 @@ export class ImagePipeline implements ItemPipeline<Product> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a URL-safe slug from a product name
|
||||
*/
|
||||
function generateSlug(name: string): string {
|
||||
return name
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9]+/g, '-')
|
||||
.replace(/^-+|-+$/g, '')
|
||||
.substring(0, 400);
|
||||
}
|
||||
|
||||
/**
|
||||
* Database Pipeline - saves items to database
|
||||
*/
|
||||
@@ -168,6 +179,10 @@ export class DatabasePipeline implements ItemPipeline<Product> {
|
||||
// Extract store and category from metadata (set by spider)
|
||||
const storeId = (item as any).storeId;
|
||||
const categoryId = (item as any).categoryId;
|
||||
const dispensaryId = (item as any).dispensaryId;
|
||||
|
||||
// Generate slug from name
|
||||
const slug = generateSlug(item.name);
|
||||
|
||||
if (!storeId || !categoryId) {
|
||||
logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`);
|
||||
@@ -195,13 +210,13 @@ export class DatabasePipeline implements ItemPipeline<Product> {
|
||||
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
|
||||
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
|
||||
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14)
|
||||
WHERE id = $12
|
||||
`, [
|
||||
item.name, item.description, item.price,
|
||||
item.strainType, item.thcPercentage, item.cbdPercentage,
|
||||
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
|
||||
JSON.stringify(item.metadata || {}), productId
|
||||
JSON.stringify(item.metadata || {}), productId, dispensaryId, slug
|
||||
]);
|
||||
|
||||
logger.debug('pipeline', `Updated product: ${item.name}`);
|
||||
@@ -209,13 +224,13 @@ export class DatabasePipeline implements ItemPipeline<Product> {
|
||||
// Insert new product
|
||||
const insertResult = await client.query(`
|
||||
INSERT INTO products (
|
||||
store_id, category_id, dutchie_product_id, name, description,
|
||||
store_id, category_id, dispensary_id, dutchie_product_id, slug, name, description,
|
||||
price, strain_type, thc_percentage, cbd_percentage,
|
||||
brand, weight, image_url, dutchie_url, in_stock, metadata
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16)
|
||||
RETURNING id
|
||||
`, [
|
||||
storeId, categoryId, item.dutchieProductId, item.name, item.description,
|
||||
storeId, categoryId, dispensaryId, item.dutchieProductId, slug, item.name, item.description,
|
||||
item.price, item.strainType, item.thcPercentage, item.cbdPercentage,
|
||||
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
|
||||
JSON.stringify(item.metadata || {})
|
||||
@@ -228,12 +243,19 @@ export class DatabasePipeline implements ItemPipeline<Product> {
|
||||
// Download image if needed
|
||||
if (item.imageUrl && !localImagePath) {
|
||||
try {
|
||||
localImagePath = await uploadImageFromUrl(item.imageUrl, productId);
|
||||
// Get store slug for organized image storage
|
||||
const storeResult = await client.query(
|
||||
'SELECT slug FROM stores WHERE id = $1',
|
||||
[storeId]
|
||||
);
|
||||
const storeSlug = storeResult.rows[0]?.slug || undefined;
|
||||
|
||||
const imageSizes = await uploadImageFromUrl(item.imageUrl, productId, storeSlug);
|
||||
// Use thumbnail path for local_image_path
|
||||
localImagePath = imageSizes.thumbnail;
|
||||
await client.query(`
|
||||
UPDATE products
|
||||
SET local_image_path = $1
|
||||
WHERE id = $2
|
||||
`, [localImagePath, productId]);
|
||||
UPDATE products SET local_image_path = $1 WHERE id = $2
|
||||
`, [imageSizes.thumbnail, productId]);
|
||||
logger.debug('pipeline', `Downloaded image for: ${item.name}`);
|
||||
} catch (error) {
|
||||
logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`);
|
||||
|
||||
@@ -176,7 +176,7 @@ async function queueProductionCrawls(): Promise<number> {
|
||||
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
|
||||
jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
|
||||
FROM stores s
|
||||
JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%')
|
||||
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`,
|
||||
[dispensary.id]
|
||||
|
||||
@@ -221,7 +221,7 @@ async function queueCategoryProductionCrawls(category?: IntelligenceCategory): P
|
||||
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
|
||||
jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
|
||||
FROM stores s
|
||||
JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%')
|
||||
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`,
|
||||
[dispensary.id, cat]
|
||||
|
||||
@@ -131,7 +131,7 @@ async function getStoreIdForDispensary(dispensaryId: number): Promise<number | n
|
||||
// Check if there's a stores entry linked to this dispensary
|
||||
const result = await pool.query(
|
||||
`SELECT s.id FROM stores s
|
||||
JOIN dispensaries d ON d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%'
|
||||
JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`,
|
||||
[dispensaryId]
|
||||
|
||||
@@ -130,6 +130,11 @@ async function uploadToLocalFilesystem(
|
||||
const mediumPath = `${baseFilename}-medium.png`;
|
||||
const fullPath = `${baseFilename}-full.png`;
|
||||
|
||||
// Ensure the target directory exists (in case initializeMinio wasn't called)
|
||||
// Extract directory from baseFilename (e.g., 'products/store-slug' or just 'products')
|
||||
const targetDir = path.join(LOCAL_IMAGES_PATH, path.dirname(baseFilename));
|
||||
await fs.mkdir(targetDir, { recursive: true });
|
||||
|
||||
await Promise.all([
|
||||
fs.writeFile(path.join(LOCAL_IMAGES_PATH, thumbnailPath), thumbnailBuffer),
|
||||
fs.writeFile(path.join(LOCAL_IMAGES_PATH, mediumPath), mediumBuffer),
|
||||
@@ -173,7 +178,12 @@ async function uploadToMinio(
|
||||
};
|
||||
}
|
||||
|
||||
export async function uploadImageFromUrl(imageUrl: string, productId: number, removeBackgrounds = true): Promise<ImageSizes> {
|
||||
export async function uploadImageFromUrl(
|
||||
imageUrl: string,
|
||||
productId: number,
|
||||
storeSlug?: string,
|
||||
removeBackgrounds = true
|
||||
): Promise<ImageSizes> {
|
||||
try {
|
||||
// Download image
|
||||
const response = await axios.get(imageUrl, { responseType: 'arraybuffer' });
|
||||
@@ -184,8 +194,9 @@ export async function uploadImageFromUrl(imageUrl: string, productId: number, re
|
||||
buffer = await removeBackground(buffer);
|
||||
}
|
||||
|
||||
// Generate unique base filename
|
||||
const baseFilename = `products/${productId}-${uuidv4()}`;
|
||||
// Generate unique base filename - organize by store if slug provided
|
||||
const storeDir = storeSlug ? `products/${storeSlug}` : 'products';
|
||||
const baseFilename = `${storeDir}/${productId}-${uuidv4()}`;
|
||||
|
||||
// Create multiple sizes with Sharp and convert to WebP/PNG for better compression
|
||||
// Use PNG for images with transparency
|
||||
|
||||
@@ -93,7 +93,7 @@ export function ScraperMonitor() {
|
||||
marginBottom: '-2px'
|
||||
}}
|
||||
>
|
||||
Brand Scrape Jobs
|
||||
Dispensary Jobs
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setActiveTab('scrapers')}
|
||||
@@ -109,7 +109,7 @@ export function ScraperMonitor() {
|
||||
marginBottom: '-2px'
|
||||
}}
|
||||
>
|
||||
Legacy Scrapers
|
||||
Crawl History
|
||||
</button>
|
||||
</div>
|
||||
|
||||
@@ -232,10 +232,10 @@ export function ScraperMonitor() {
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'start' }}>
|
||||
<div style={{ flex: 1 }}>
|
||||
<div style={{ fontSize: '18px', fontWeight: '600', marginBottom: '8px' }}>
|
||||
{job.brand_name}
|
||||
{job.dispensary_name || job.brand_name}
|
||||
</div>
|
||||
<div style={{ fontSize: '14px', color: '#666', marginBottom: '12px' }}>
|
||||
Worker: {job.worker_id} | Job #{job.id}
|
||||
{job.job_type || 'crawl'} | Job #{job.id}
|
||||
</div>
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(150px, 1fr))', gap: '12px' }}>
|
||||
<div>
|
||||
@@ -290,8 +290,8 @@ export function ScraperMonitor() {
|
||||
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
|
||||
<thead>
|
||||
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Brand</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Worker</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Type</th>
|
||||
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Status</th>
|
||||
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Found</th>
|
||||
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Saved</th>
|
||||
@@ -302,8 +302,8 @@ export function ScraperMonitor() {
|
||||
<tbody>
|
||||
{recentJobs.map((job: any) => (
|
||||
<tr key={job.id} style={{ borderBottom: '1px solid #eee' }}>
|
||||
<td style={{ padding: '15px' }}>{job.brand_name}</td>
|
||||
<td style={{ padding: '15px', fontSize: '14px', color: '#666' }}>{job.worker_id || '-'}</td>
|
||||
<td style={{ padding: '15px' }}>{job.dispensary_name || job.brand_name}</td>
|
||||
<td style={{ padding: '15px', fontSize: '14px', color: '#666' }}>{job.job_type || '-'}</td>
|
||||
<td style={{ padding: '15px', textAlign: 'center' }}>
|
||||
<span style={{
|
||||
padding: '4px 10px',
|
||||
@@ -481,22 +481,37 @@ export function ScraperMonitor() {
|
||||
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
|
||||
<thead>
|
||||
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Store</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Category</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Status</th>
|
||||
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Found</th>
|
||||
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Products</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Scraped</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Crawled</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{history.map((item, index) => (
|
||||
<tr key={index} style={{ borderBottom: '1px solid #eee' }}>
|
||||
<td style={{ padding: '15px' }}>{item.store_name}</td>
|
||||
<td style={{ padding: '15px' }}>{item.category_name}</td>
|
||||
<td style={{ padding: '15px' }}>{item.dispensary_name || item.store_name}</td>
|
||||
<td style={{ padding: '15px' }}>
|
||||
<span style={{
|
||||
padding: '4px 10px',
|
||||
borderRadius: '12px',
|
||||
fontSize: '12px',
|
||||
fontWeight: '600',
|
||||
background: item.status === 'completed' ? '#d1fae5' : item.status === 'failed' ? '#fee2e2' : '#fef3c7',
|
||||
color: item.status === 'completed' ? '#065f46' : item.status === 'failed' ? '#991b1b' : '#92400e'
|
||||
}}>
|
||||
{item.status || '-'}
|
||||
</span>
|
||||
</td>
|
||||
<td style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>
|
||||
{item.products_found || '-'}
|
||||
</td>
|
||||
<td style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>
|
||||
{item.product_count}
|
||||
</td>
|
||||
<td style={{ padding: '15px', color: '#666' }}>
|
||||
{new Date(item.last_scraped_at).toLocaleString()}
|
||||
{item.last_scraped_at ? new Date(item.last_scraped_at).toLocaleString() : '-'}
|
||||
</td>
|
||||
</tr>
|
||||
))}
|
||||
|
||||
@@ -17,61 +17,61 @@ const USER_AGENTS = {
|
||||
};
|
||||
|
||||
export function ScraperTools() {
|
||||
const [stores, setStores] = useState<any[]>([]);
|
||||
const [selectedStore, setSelectedStore] = useState<number | null>(null);
|
||||
const [dispensaries, setDispensaries] = useState<any[]>([]);
|
||||
const [selectedDispensary, setSelectedDispensary] = useState<number | null>(null);
|
||||
const [parallelScrapers, setParallelScrapers] = useState(3);
|
||||
const [selectedUserAgent, setSelectedUserAgent] = useState<string>('rotate-desktop');
|
||||
const [scraping, setScraping] = useState(false);
|
||||
const [downloadingImages, setDownloadingImages] = useState(false);
|
||||
const [discoveringCategories, setDiscoveringCategories] = useState(false);
|
||||
const [debugging, setDebugging] = useState(false);
|
||||
const [notification, setNotification] = useState<{ message: string; type: 'success' | 'error' | 'info' } | null>(null);
|
||||
const [loading, setLoading] = useState(true);
|
||||
|
||||
useEffect(() => {
|
||||
loadStores();
|
||||
loadDispensaries();
|
||||
}, []);
|
||||
|
||||
const loadStores = async () => {
|
||||
const loadDispensaries = async () => {
|
||||
setLoading(true);
|
||||
try {
|
||||
const data = await api.getStores();
|
||||
setStores(data.stores);
|
||||
if (data.stores.length > 0) {
|
||||
setSelectedStore(data.stores[0].id);
|
||||
const data = await api.getDispensaries();
|
||||
// Filter to dispensaries that have a menu_url and are scrape enabled
|
||||
const scrapableDispensaries = data.dispensaries.filter((d: any) => d.menu_url && d.scrape_enabled);
|
||||
setDispensaries(scrapableDispensaries);
|
||||
if (scrapableDispensaries.length > 0) {
|
||||
setSelectedDispensary(scrapableDispensaries[0].id);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Failed to load stores:', error);
|
||||
console.error('Failed to load dispensaries:', error);
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
};
|
||||
|
||||
const handleScrape = async () => {
|
||||
if (!selectedStore || scraping) return;
|
||||
if (!selectedDispensary || scraping) return;
|
||||
|
||||
setScraping(true);
|
||||
try {
|
||||
await api.scrapeStore(selectedStore, parallelScrapers, selectedUserAgent || undefined);
|
||||
await api.triggerDispensaryCrawl(selectedDispensary);
|
||||
setNotification({
|
||||
message: `Scrape started with ${parallelScrapers} parallel scrapers using ${USER_AGENTS[selectedUserAgent as keyof typeof USER_AGENTS] || 'Random'} UA! Check the Scraper Monitor for progress.`,
|
||||
message: `Crawl started for dispensary! Check the Scraper Monitor for progress.`,
|
||||
type: 'success'
|
||||
});
|
||||
} catch (error: any) {
|
||||
setNotification({ message: 'Failed to start scrape: ' + error.message, type: 'error' });
|
||||
setNotification({ message: 'Failed to start crawl: ' + error.message, type: 'error' });
|
||||
} finally {
|
||||
setScraping(false);
|
||||
}
|
||||
};
|
||||
|
||||
const handleDownloadImages = async () => {
|
||||
if (!selectedStore || downloadingImages) return;
|
||||
if (!selectedDispensary || downloadingImages) return;
|
||||
|
||||
setDownloadingImages(true);
|
||||
try {
|
||||
const result = await api.downloadStoreImages(selectedStore);
|
||||
// TODO: Implement dispensary image download endpoint
|
||||
setNotification({
|
||||
message: `Image download started! ${result.total_missing} missing images will be downloaded.`,
|
||||
message: `Image download feature coming soon!`,
|
||||
type: 'info'
|
||||
});
|
||||
} catch (error: any) {
|
||||
@@ -81,35 +81,7 @@ export function ScraperTools() {
|
||||
}
|
||||
};
|
||||
|
||||
const handleDiscoverCategories = async () => {
|
||||
if (!selectedStore || discoveringCategories) return;
|
||||
|
||||
setDiscoveringCategories(true);
|
||||
try {
|
||||
await api.discoverStoreCategories(selectedStore);
|
||||
setNotification({ message: 'Category discovery started! Check logs for progress.', type: 'info' });
|
||||
} catch (error: any) {
|
||||
setNotification({ message: 'Failed to start category discovery: ' + error.message, type: 'error' });
|
||||
} finally {
|
||||
setDiscoveringCategories(false);
|
||||
}
|
||||
};
|
||||
|
||||
const handleDebug = async () => {
|
||||
if (!selectedStore || debugging) return;
|
||||
|
||||
setDebugging(true);
|
||||
try {
|
||||
await api.debugScrapeStore(selectedStore);
|
||||
setNotification({ message: 'Debug started! Check Logs page for output.', type: 'info' });
|
||||
} catch (error: any) {
|
||||
setNotification({ message: 'Debug failed: ' + error.message, type: 'error' });
|
||||
} finally {
|
||||
setDebugging(false);
|
||||
}
|
||||
};
|
||||
|
||||
const selectedStoreData = stores.find(s => s.id === selectedStore);
|
||||
const selectedDispensaryData = dispensaries.find(d => d.id === selectedDispensary);
|
||||
|
||||
if (loading) {
|
||||
return (
|
||||
@@ -133,32 +105,32 @@ export function ScraperTools() {
|
||||
<div className="space-y-6">
|
||||
<div>
|
||||
<h1 className="text-3xl font-bold">Scraper Tools</h1>
|
||||
<p className="text-gray-500 mt-2">Manage scraping operations for your stores</p>
|
||||
<p className="text-gray-500 mt-2">Manage crawling operations for dispensaries</p>
|
||||
</div>
|
||||
|
||||
{/* Store Selection */}
|
||||
{/* Dispensary Selection */}
|
||||
<div className="card bg-base-100 shadow-xl">
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Select Store</h2>
|
||||
<h2 className="card-title">Select Dispensary</h2>
|
||||
<select
|
||||
className="select select-bordered w-full max-w-md"
|
||||
value={selectedStore || ''}
|
||||
onChange={(e) => setSelectedStore(parseInt(e.target.value))}
|
||||
value={selectedDispensary || ''}
|
||||
onChange={(e) => setSelectedDispensary(parseInt(e.target.value))}
|
||||
>
|
||||
{stores.map(store => (
|
||||
<option key={store.id} value={store.id}>
|
||||
{store.name} ({store.product_count || 0} products)
|
||||
{dispensaries.map(disp => (
|
||||
<option key={disp.id} value={disp.id}>
|
||||
{disp.dba_name || disp.name} - {disp.city}, {disp.state}
|
||||
</option>
|
||||
))}
|
||||
</select>
|
||||
|
||||
{selectedStoreData && (
|
||||
{selectedDispensaryData && (
|
||||
<div className="mt-4 p-4 bg-base-200 rounded-lg">
|
||||
<div className="grid grid-cols-2 md:grid-cols-4 gap-4 text-sm">
|
||||
<div>
|
||||
<div className="text-gray-500">Status</div>
|
||||
<div className="font-semibold">
|
||||
{selectedStoreData.scrape_enabled ? (
|
||||
{selectedDispensaryData.scrape_enabled ? (
|
||||
<span className="badge badge-success">Enabled</span>
|
||||
) : (
|
||||
<span className="badge badge-error">Disabled</span>
|
||||
@@ -166,18 +138,18 @@ export function ScraperTools() {
|
||||
</div>
|
||||
</div>
|
||||
<div>
|
||||
<div className="text-gray-500">Categories</div>
|
||||
<div className="font-semibold">{selectedStoreData.category_count || 0}</div>
|
||||
<div className="text-gray-500">Provider</div>
|
||||
<div className="font-semibold">{selectedDispensaryData.provider_type || 'Unknown'}</div>
|
||||
</div>
|
||||
<div>
|
||||
<div className="text-gray-500">Products</div>
|
||||
<div className="font-semibold">{selectedStoreData.product_count || 0}</div>
|
||||
<div className="font-semibold">{selectedDispensaryData.product_count || 0}</div>
|
||||
</div>
|
||||
<div>
|
||||
<div className="text-gray-500">Last Scraped</div>
|
||||
<div className="text-gray-500">Last Crawled</div>
|
||||
<div className="font-semibold">
|
||||
{selectedStoreData.last_scraped_at
|
||||
? new Date(selectedStoreData.last_scraped_at).toLocaleDateString()
|
||||
{selectedDispensaryData.last_crawl_at
|
||||
? new Date(selectedDispensaryData.last_crawl_at).toLocaleDateString()
|
||||
: 'Never'}
|
||||
</div>
|
||||
</div>
|
||||
@@ -189,56 +161,21 @@ export function ScraperTools() {
|
||||
|
||||
{/* Scraper Actions */}
|
||||
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
|
||||
{/* Scrape Now */}
|
||||
{/* Crawl Now */}
|
||||
<div className="card bg-base-100 shadow-xl">
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Scrape Store</h2>
|
||||
<h2 className="card-title">Crawl Dispensary</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Start scraping products from the selected store
|
||||
Start crawling products from the selected dispensary menu
|
||||
</p>
|
||||
|
||||
<div className="form-control w-full mt-4">
|
||||
<label className="label">
|
||||
<span className="label-text">Parallel Scrapers</span>
|
||||
</label>
|
||||
<input
|
||||
type="number"
|
||||
min="1"
|
||||
max="10"
|
||||
value={parallelScrapers}
|
||||
onChange={(e) => setParallelScrapers(parseInt(e.target.value) || 3)}
|
||||
className="input input-bordered w-full"
|
||||
/>
|
||||
<label className="label">
|
||||
<span className="label-text-alt">Number of concurrent scraping processes (1-10)</span>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<div className="form-control w-full mt-4">
|
||||
<label className="label">
|
||||
<span className="label-text">User Agent</span>
|
||||
</label>
|
||||
<select
|
||||
className="select select-bordered w-full"
|
||||
value={selectedUserAgent}
|
||||
onChange={(e) => setSelectedUserAgent(e.target.value)}
|
||||
>
|
||||
{Object.entries(USER_AGENTS).map(([key, label]) => (
|
||||
<option key={key} value={key}>{label}</option>
|
||||
))}
|
||||
</select>
|
||||
<label className="label">
|
||||
<span className="label-text-alt">Browser/bot identity for scraping session</span>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<div className="card-actions justify-end mt-4">
|
||||
<button
|
||||
onClick={handleScrape}
|
||||
disabled={!selectedStore || scraping}
|
||||
disabled={!selectedDispensary || scraping}
|
||||
className={`btn btn-primary ${scraping ? 'loading' : ''}`}
|
||||
>
|
||||
{scraping ? 'Scraping...' : 'Start Scrape'}
|
||||
{scraping ? 'Starting...' : 'Start Crawl'}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
@@ -249,13 +186,13 @@ export function ScraperTools() {
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Download Images</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Download missing product images for the selected store
|
||||
Download missing product images for the selected dispensary
|
||||
</p>
|
||||
|
||||
<div className="card-actions justify-end mt-auto">
|
||||
<button
|
||||
onClick={handleDownloadImages}
|
||||
disabled={!selectedStore || downloadingImages}
|
||||
disabled={!selectedDispensary || downloadingImages}
|
||||
className={`btn btn-secondary ${downloadingImages ? 'loading' : ''}`}
|
||||
>
|
||||
{downloadingImages ? 'Downloading...' : 'Download Missing Images'}
|
||||
@@ -263,46 +200,6 @@ export function ScraperTools() {
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Discover Categories */}
|
||||
<div className="card bg-base-100 shadow-xl">
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Discover Categories</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Automatically discover and create categories from the store
|
||||
</p>
|
||||
|
||||
<div className="card-actions justify-end mt-auto">
|
||||
<button
|
||||
onClick={handleDiscoverCategories}
|
||||
disabled={!selectedStore || discoveringCategories}
|
||||
className={`btn btn-accent ${discoveringCategories ? 'loading' : ''}`}
|
||||
>
|
||||
{discoveringCategories ? 'Discovering...' : 'Discover Categories'}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Debug Scraper */}
|
||||
<div className="card bg-base-100 shadow-xl">
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Debug Scraper</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Run scraper in debug mode and view detailed logs
|
||||
</p>
|
||||
|
||||
<div className="card-actions justify-end mt-auto">
|
||||
<button
|
||||
onClick={handleDebug}
|
||||
disabled={!selectedStore || debugging}
|
||||
className={`btn btn-warning ${debugging ? 'loading' : ''}`}
|
||||
>
|
||||
{debugging ? 'Debugging...' : 'Start Debug'}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Quick Links */}
|
||||
|
||||
Reference in New Issue
Block a user