Initial commit - Dutchie dispensary scraper

Author: Kelly
Date: 2025-11-28 19:45:44 -07:00
commit 5757a8e9bd
23375 changed files with 3788799 additions and 0 deletions

backend/dist/auth/middleware.js vendored Normal file

@@ -0,0 +1,65 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.generateToken = generateToken;
exports.verifyToken = verifyToken;
exports.authenticateUser = authenticateUser;
exports.authMiddleware = authMiddleware;
exports.requireRole = requireRole;
const jsonwebtoken_1 = __importDefault(require("jsonwebtoken"));
const bcrypt_1 = __importDefault(require("bcrypt"));
const migrate_1 = require("../db/migrate");
const JWT_SECRET = process.env.JWT_SECRET || 'change_this_in_production';
function generateToken(user) {
return jsonwebtoken_1.default.sign({ id: user.id, email: user.email, role: user.role }, JWT_SECRET, { expiresIn: '7d' });
}
function verifyToken(token) {
try {
return jsonwebtoken_1.default.verify(token, JWT_SECRET);
}
catch (error) {
return null;
}
}
async function authenticateUser(email, password) {
const result = await migrate_1.pool.query('SELECT id, email, password_hash, role FROM users WHERE email = $1', [email]);
if (result.rows.length === 0) {
return null;
}
const user = result.rows[0];
const isValid = await bcrypt_1.default.compare(password, user.password_hash);
if (!isValid) {
return null;
}
return {
id: user.id,
email: user.email,
role: user.role
};
}
function authMiddleware(req, res, next) {
const authHeader = req.headers.authorization;
if (!authHeader || !authHeader.startsWith('Bearer ')) {
return res.status(401).json({ error: 'No token provided' });
}
const token = authHeader.substring(7);
const user = verifyToken(token);
if (!user) {
return res.status(401).json({ error: 'Invalid token' });
}
req.user = user;
next();
}
function requireRole(...roles) {
return (req, res, next) => {
if (!req.user) {
return res.status(401).json({ error: 'Not authenticated' });
}
if (!roles.includes(req.user.role)) {
return res.status(403).json({ error: 'Insufficient permissions' });
}
next();
};
}
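
A minimal wiring sketch (not part of this commit), assuming Express is installed and the compiled dist output is on disk; the route path and user record are placeholders:

// demo-auth.js - hypothetical mounting of the exported middleware
const express = require('express');
const { authMiddleware, requireRole, generateToken } = require('./backend/dist/auth/middleware');

const app = express();
app.use(express.json());

// Anything behind authMiddleware expects an "Authorization: Bearer <jwt>" header
app.get('/api/admin-only', authMiddleware, requireRole('superadmin'), (req, res) => {
    res.json({ hello: req.user.email });
});

// Tokens carry { id, email, role } and expire after 7 days per generateToken above
console.log(generateToken({ id: 1, email: 'admin@example.com', role: 'superadmin' }));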

backend/dist/db/add-jobs-table.js vendored Normal file

@@ -0,0 +1,41 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("./migrate");
async function addJobsTable() {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
await client.query(`
CREATE TABLE IF NOT EXISTS jobs (
id SERIAL PRIMARY KEY,
type VARCHAR(50) NOT NULL,
status VARCHAR(50) DEFAULT 'pending',
store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE,
progress INTEGER DEFAULT 0,
total_items INTEGER,
processed_items INTEGER DEFAULT 0,
error TEXT,
started_at TIMESTAMP,
completed_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
CREATE INDEX IF NOT EXISTS idx_jobs_type ON jobs(type);
CREATE INDEX IF NOT EXISTS idx_jobs_store_id ON jobs(store_id);
`);
await client.query('COMMIT');
console.log('✅ Jobs table created successfully');
}
catch (error) {
await client.query('ROLLBACK');
console.error('❌ Failed to create jobs table:', error);
throw error;
}
finally {
client.release();
}
}
addJobsTable()
.then(() => process.exit(0))
.catch(() => process.exit(1));
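
The script only creates the table; no worker code is included in this file. For illustration, a claim-next-job query against this schema could look like the sketch below; the FOR UPDATE SKIP LOCKED pattern is an assumption, not something this commit establishes:

// hypothetical-worker.js - sketch of atomically claiming one pending job
const { pool } = require('./backend/dist/db/migrate');

async function claimNextJob() {
    // Pick the oldest pending job and flip it to running in a single statement
    const result = await pool.query(`
        UPDATE jobs
        SET status = 'running', started_at = CURRENT_TIMESTAMP
        WHERE id = (
            SELECT id FROM jobs
            WHERE status = 'pending'
            ORDER BY created_at
            FOR UPDATE SKIP LOCKED
            LIMIT 1
        )
        RETURNING *
    `);
    return result.rows[0] || null;
}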

backend/dist/db/migrate.js vendored Normal file

@@ -0,0 +1,182 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.pool = void 0;
exports.runMigrations = runMigrations;
const pg_1 = require("pg");
const pool = new pg_1.Pool({
connectionString: process.env.DATABASE_URL,
});
exports.pool = pool;
async function runMigrations() {
const client = await pool.connect();
try {
await client.query('BEGIN');
// Users table
await client.query(`
CREATE TABLE IF NOT EXISTS users (
id SERIAL PRIMARY KEY,
email VARCHAR(255) UNIQUE NOT NULL,
password_hash VARCHAR(255) NOT NULL,
role VARCHAR(50) DEFAULT 'admin',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
// Stores table
await client.query(`
CREATE TABLE IF NOT EXISTS stores (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) UNIQUE NOT NULL,
dutchie_url TEXT NOT NULL,
active BOOLEAN DEFAULT true,
scrape_enabled BOOLEAN DEFAULT true,
last_scraped_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
// Categories table (shop, brands, specials)
await client.query(`
CREATE TABLE IF NOT EXISTS categories (
id SERIAL PRIMARY KEY,
store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) NOT NULL,
dutchie_url TEXT NOT NULL,
scrape_enabled BOOLEAN DEFAULT true,
last_scraped_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(store_id, slug)
);
`);
// Products table
await client.query(`
CREATE TABLE IF NOT EXISTS products (
id SERIAL PRIMARY KEY,
store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE,
category_id INTEGER REFERENCES categories(id) ON DELETE SET NULL,
dutchie_product_id VARCHAR(255),
name VARCHAR(500) NOT NULL,
slug VARCHAR(500),
description TEXT,
price DECIMAL(10, 2),
original_price DECIMAL(10, 2),
strain_type VARCHAR(100),
thc_percentage DECIMAL(5, 2),
cbd_percentage DECIMAL(5, 2),
brand VARCHAR(255),
weight VARCHAR(100),
image_url TEXT,
local_image_path TEXT,
dutchie_url TEXT NOT NULL,
in_stock BOOLEAN DEFAULT true,
is_special BOOLEAN DEFAULT false,
metadata JSONB,
first_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(store_id, dutchie_product_id)
);
`);
// Campaigns table
await client.query(`
CREATE TABLE IF NOT EXISTS campaigns (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) UNIQUE NOT NULL,
description TEXT,
display_style VARCHAR(50) DEFAULT 'grid',
active BOOLEAN DEFAULT true,
start_date TIMESTAMP,
end_date TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
// Campaign products (many-to-many with ordering)
await client.query(`
CREATE TABLE IF NOT EXISTS campaign_products (
id SERIAL PRIMARY KEY,
campaign_id INTEGER REFERENCES campaigns(id) ON DELETE CASCADE,
product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
display_order INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(campaign_id, product_id)
);
`);
// Click tracking
await client.query(`
CREATE TABLE IF NOT EXISTS clicks (
id SERIAL PRIMARY KEY,
product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
campaign_id INTEGER REFERENCES campaigns(id) ON DELETE SET NULL,
ip_address VARCHAR(45),
user_agent TEXT,
referrer TEXT,
clicked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
// Create index on clicked_at for analytics queries
await client.query(`
CREATE INDEX IF NOT EXISTS idx_clicks_clicked_at ON clicks(clicked_at);
CREATE INDEX IF NOT EXISTS idx_clicks_product_id ON clicks(product_id);
CREATE INDEX IF NOT EXISTS idx_clicks_campaign_id ON clicks(campaign_id);
`);
// Proxies table
await client.query(`
CREATE TABLE IF NOT EXISTS proxies (
id SERIAL PRIMARY KEY,
host VARCHAR(255) NOT NULL,
port INTEGER NOT NULL,
protocol VARCHAR(10) NOT NULL,
username VARCHAR(255),
password VARCHAR(255),
active BOOLEAN DEFAULT true,
is_anonymous BOOLEAN DEFAULT false,
last_tested_at TIMESTAMP,
test_result VARCHAR(50),
response_time_ms INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(host, port, protocol)
);
`);
// Settings table
await client.query(`
CREATE TABLE IF NOT EXISTS settings (
key VARCHAR(255) PRIMARY KEY,
value TEXT NOT NULL,
description TEXT,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
// Insert default settings
await client.query(`
INSERT INTO settings (key, value, description) VALUES
('scrape_interval_hours', '4', 'How often to scrape stores (in hours)'),
('scrape_specials_time', '00:01', 'Time to scrape specials daily (HH:MM in 24h format)'),
('analytics_retention_days', '365', 'How many days to keep analytics data'),
('proxy_timeout_ms', '3000', 'Proxy timeout in milliseconds'),
('proxy_test_url', 'https://httpbin.org/ip', 'URL to test proxies against')
ON CONFLICT (key) DO NOTHING;
`);
await client.query('COMMIT');
console.log('✅ Migrations completed successfully');
}
catch (error) {
await client.query('ROLLBACK');
console.error('❌ Migration failed:', error);
throw error;
}
finally {
client.release();
}
}
// Run migrations if this file is executed directly
if (require.main === module) {
runMigrations()
.then(() => process.exit(0))
.catch(() => process.exit(1));
}
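
Since runMigrations is exported alongside pool, it can be driven programmatically as well as via node; a sketch assuming DATABASE_URL points at a reachable Postgres instance:

// bootstrap.js - hypothetical programmatic invocation of the migrations
const { runMigrations, pool } = require('./backend/dist/db/migrate');

runMigrations()
    .then(() => console.log('schema ready'))
    .catch((err) => { console.error(err); process.exitCode = 1; })
    .finally(() => pool.end()); // close the pool when used as a one-off script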

backend/dist/db/seed.js vendored Normal file

@@ -0,0 +1,72 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.seedDatabase = seedDatabase;
const migrate_1 = require("./migrate");
const bcrypt_1 = __importDefault(require("bcrypt"));
async function seedDatabase() {
const client = await migrate_1.pool.connect();
try {
// Create admin user
const adminEmail = process.env.ADMIN_EMAIL || 'admin@example.com';
const adminPassword = process.env.ADMIN_PASSWORD || 'password';
const passwordHash = await bcrypt_1.default.hash(adminPassword, 10);
await client.query(`
INSERT INTO users (email, password_hash, role)
VALUES ($1, $2, 'superadmin')
ON CONFLICT (email) DO UPDATE
SET password_hash = $2, role = 'superadmin'
`, [adminEmail, passwordHash]);
console.log(`✅ Admin user created: ${adminEmail}`);
// Create Deeply Rooted store
const storeResult = await client.query(`
INSERT INTO stores (name, slug, dutchie_url, active, scrape_enabled)
VALUES ('Deeply Rooted', 'AZ-Deeply-Rooted', 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', true, true)
ON CONFLICT (slug) DO UPDATE
SET name = 'Deeply Rooted', dutchie_url = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'
RETURNING id
`);
const storeId = storeResult.rows[0].id;
console.log(`✅ Store created: Deeply Rooted (ID: ${storeId})`);
// Create categories for the store
const categories = [
{ name: 'Shop', slug: 'shop', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted' },
{ name: 'Brands', slug: 'brands', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/brands' },
{ name: 'Specials', slug: 'specials', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/specials/sale/66501e094faefa00079b1835' }
];
for (const cat of categories) {
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
VALUES ($1, $2, $3, $4, true)
ON CONFLICT (store_id, slug) DO UPDATE
SET name = $2, dutchie_url = $4
`, [storeId, cat.name, cat.slug, cat.url]);
}
console.log('✅ Categories created: Shop, Brands, Specials');
// Create a default "Featured Products" campaign
await client.query(`
INSERT INTO campaigns (name, slug, description, display_style, active)
VALUES ('Featured Products', 'featured', 'Default featured products campaign', 'grid', true)
ON CONFLICT (slug) DO NOTHING
`);
console.log('✅ Default campaign created: Featured Products');
console.log('\n🎉 Seeding completed successfully!');
console.log(`\n📧 Login: ${adminEmail}`);
console.log(`🔑 Password: ${adminPassword}`);
}
catch (error) {
console.error('❌ Seeding failed:', error);
throw error;
}
finally {
client.release();
}
}
// Run seed if this file is executed directly
if (require.main === module) {
seedDatabase()
.then(() => process.exit(0))
.catch(() => process.exit(1));
}
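
Because the admin credentials fall back to insecure defaults, a typical run overrides them first; the values below are placeholders:

// seed-run.js - hypothetical seed invocation with explicit credentials
process.env.ADMIN_EMAIL = 'ops@example.com';          // placeholder
process.env.ADMIN_PASSWORD = 'a-long-random-secret';  // placeholder; never ship the 'password' default

const { seedDatabase } = require('./backend/dist/db/seed');
const { pool } = require('./backend/dist/db/migrate');

seedDatabase()
    .catch((err) => { console.error(err); process.exitCode = 1; })
    .finally(() => pool.end());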

@@ -0,0 +1,48 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("./migrate");
async function updateCategoriesHierarchy() {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
// Add parent_id for nested categories
await client.query(`
ALTER TABLE categories
ADD COLUMN IF NOT EXISTS parent_id INTEGER REFERENCES categories(id) ON DELETE CASCADE;
ALTER TABLE categories
ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0;
ALTER TABLE categories
ADD COLUMN IF NOT EXISTS description TEXT;
CREATE INDEX IF NOT EXISTS idx_categories_parent_id ON categories(parent_id);
`);
// Add category_path for easy searching (e.g., 'shop/flower')
await client.query(`
ALTER TABLE categories
ADD COLUMN IF NOT EXISTS path VARCHAR(500);
CREATE INDEX IF NOT EXISTS idx_categories_path ON categories(path);
`);
// Update existing categories to have paths
await client.query(`
UPDATE categories
SET path = slug
WHERE path IS NULL;
`);
await client.query('COMMIT');
console.log('✅ Categories hierarchy updated successfully');
}
catch (error) {
await client.query('ROLLBACK');
console.error('❌ Failed to update categories:', error);
throw error;
}
finally {
client.release();
}
}
updateCategoriesHierarchy()
.then(() => process.exit(0))
.catch(() => process.exit(1));

backend/dist/index.js vendored Normal file

@@ -0,0 +1,57 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = __importDefault(require("express"));
const cors_1 = __importDefault(require("cors"));
const dotenv_1 = __importDefault(require("dotenv"));
const minio_1 = require("./utils/minio");
const logger_1 = require("./services/logger");
dotenv_1.default.config();
const app = (0, express_1.default)();
const PORT = process.env.PORT || 3010;
app.use((0, cors_1.default)());
app.use(express_1.default.json());
app.get('/health', (req, res) => {
res.json({ status: 'ok', timestamp: new Date().toISOString() });
});
const auth_1 = __importDefault(require("./routes/auth"));
const dashboard_1 = __importDefault(require("./routes/dashboard"));
const stores_1 = __importDefault(require("./routes/stores"));
const categories_1 = __importDefault(require("./routes/categories"));
const products_1 = __importDefault(require("./routes/products"));
const campaigns_1 = __importDefault(require("./routes/campaigns"));
const analytics_1 = __importDefault(require("./routes/analytics"));
const settings_1 = __importDefault(require("./routes/settings"));
const proxies_1 = __importDefault(require("./routes/proxies"));
const logs_1 = __importDefault(require("./routes/logs"));
const scraper_monitor_1 = __importDefault(require("./routes/scraper-monitor"));
app.use('/api/auth', auth_1.default);
app.use('/api/dashboard', dashboard_1.default);
app.use('/api/stores', stores_1.default);
app.use('/api/categories', categories_1.default);
app.use('/api/products', products_1.default);
app.use('/api/campaigns', campaigns_1.default);
app.use('/api/analytics', analytics_1.default);
app.use('/api/settings', settings_1.default);
app.use('/api/proxies', proxies_1.default);
app.use('/api/logs', logs_1.default);
app.use('/api/scraper-monitor', scraper_monitor_1.default);
async function startServer() {
try {
logger_1.logger.info('system', 'Starting server...');
await (0, minio_1.initializeMinio)();
logger_1.logger.info('system', 'Minio initialized');
app.listen(PORT, () => {
logger_1.logger.info('system', `Server running on port ${PORT}`);
console.log(`🚀 Server running on port ${PORT}`);
});
}
catch (error) {
logger_1.logger.error('system', `Failed to start server: ${error}`);
console.error('Failed to start server:', error);
process.exit(1);
}
}
startServer();
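
A quick smoke test against the running server, using the Node 18+ global fetch; 3010 is the default port above:

// smoke-test.js - hypothetical /health check
const BASE = process.env.API_BASE || 'http://localhost:3010';

fetch(`${BASE}/health`)
    .then((r) => r.json())
    .then(console.log) // expects { status: 'ok', timestamp: '...' }
    .catch((err) => { console.error('server unreachable:', err.message); process.exitCode = 1; });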

backend/dist/routes/analytics.js vendored Normal file

@@ -0,0 +1,121 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get analytics overview
router.get('/overview', async (req, res) => {
try {
const days = Math.max(1, parseInt(req.query.days, 10) || 30); // clamp so the interpolated INTERVAL strings below never see NaN
// Total clicks
const clicksResult = await migrate_1.pool.query(`
SELECT COUNT(*) as total_clicks
FROM clicks
WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
`);
// Unique products clicked
const uniqueProductsResult = await migrate_1.pool.query(`
SELECT COUNT(DISTINCT product_id) as unique_products
FROM clicks
WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
`);
// Clicks by day
const clicksByDayResult = await migrate_1.pool.query(`
SELECT DATE(clicked_at) as date, COUNT(*) as clicks
FROM clicks
WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
GROUP BY DATE(clicked_at)
ORDER BY date DESC
`);
// Top products
const topProductsResult = await migrate_1.pool.query(`
SELECT p.id, p.name, p.price, COUNT(c.id) as click_count
FROM clicks c
JOIN products p ON c.product_id = p.id
WHERE c.clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
GROUP BY p.id, p.name, p.price
ORDER BY click_count DESC
LIMIT 10
`);
res.json({
overview: {
total_clicks: parseInt(clicksResult.rows[0].total_clicks),
unique_products: parseInt(uniqueProductsResult.rows[0].unique_products)
},
clicks_by_day: clicksByDayResult.rows,
top_products: topProductsResult.rows
});
}
catch (error) {
console.error('Error fetching analytics:', error);
res.status(500).json({ error: 'Failed to fetch analytics' });
}
});
// Get product analytics
router.get('/products/:id', async (req, res) => {
try {
const { id } = req.params;
const days = Math.max(1, parseInt(req.query.days, 10) || 30); // clamp; interpolated into the INTERVAL strings below
// Total clicks for this product
const totalResult = await migrate_1.pool.query(`
SELECT COUNT(*) as total_clicks
FROM clicks
WHERE product_id = $1
AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
`, [id]);
// Clicks by day
const byDayResult = await migrate_1.pool.query(`
SELECT DATE(clicked_at) as date, COUNT(*) as clicks
FROM clicks
WHERE product_id = $1
AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
GROUP BY DATE(clicked_at)
ORDER BY date DESC
`, [id]);
res.json({
product_id: parseInt(id),
total_clicks: parseInt(totalResult.rows[0].total_clicks),
clicks_by_day: byDayResult.rows
});
}
catch (error) {
console.error('Error fetching product analytics:', error);
res.status(500).json({ error: 'Failed to fetch product analytics' });
}
});
// Get campaign analytics
router.get('/campaigns/:id', async (req, res) => {
try {
const { id } = req.params;
const days = Math.max(1, parseInt(req.query.days, 10) || 30); // clamp; interpolated into the INTERVAL strings below
// Total clicks for this campaign
const totalResult = await migrate_1.pool.query(`
SELECT COUNT(*) as total_clicks
FROM clicks
WHERE campaign_id = $1
AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
`, [id]);
// Clicks by product in this campaign
const byProductResult = await migrate_1.pool.query(`
SELECT p.id, p.name, COUNT(c.id) as clicks
FROM clicks c
JOIN products p ON c.product_id = p.id
WHERE c.campaign_id = $1
AND c.clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
GROUP BY p.id, p.name
ORDER BY clicks DESC
`, [id]);
res.json({
campaign_id: parseInt(id),
total_clicks: parseInt(totalResult.rows[0].total_clicks),
clicks_by_product: byProductResult.rows
});
}
catch (error) {
console.error('Error fetching campaign analytics:', error);
res.status(500).json({ error: 'Failed to fetch campaign analytics' });
}
});
exports.default = router;
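
Every analytics route sits behind authMiddleware, so a client needs a bearer token first; this sketch assumes one is already exported as TOKEN:

// analytics-smoke.js - hypothetical overview request (Node 18+)
const BASE = process.env.API_BASE || 'http://localhost:3010';

fetch(`${BASE}/api/analytics/overview?days=7`, {
    headers: { Authorization: `Bearer ${process.env.TOKEN}` }
})
    .then((r) => r.json())
    .then(console.log); // { overview: { total_clicks, unique_products }, clicks_by_day, top_products }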

backend/dist/routes/auth.js vendored Normal file

@@ -0,0 +1,43 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const router = (0, express_1.Router)();
// Login
router.post('/login', async (req, res) => {
try {
const { email, password } = req.body;
if (!email || !password) {
return res.status(400).json({ error: 'Email and password required' });
}
const user = await (0, middleware_1.authenticateUser)(email, password);
if (!user) {
return res.status(401).json({ error: 'Invalid credentials' });
}
const token = (0, middleware_1.generateToken)(user);
res.json({
token,
user: {
id: user.id,
email: user.email,
role: user.role
}
});
}
catch (error) {
console.error('Login error:', error);
res.status(500).json({ error: 'Internal server error' });
}
});
// Get current user
router.get('/me', middleware_1.authMiddleware, async (req, res) => {
res.json({
user: req.user
});
});
// Refresh token
router.post('/refresh', middleware_1.authMiddleware, async (req, res) => {
const token = (0, middleware_1.generateToken)(req.user);
res.json({ token });
});
exports.default = router;
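
An end-to-end sketch of the login and refresh flow; the credentials are the seed defaults and should be treated as placeholders:

// login-flow.js - hypothetical client for these routes (Node 18+)
const BASE = process.env.API_BASE || 'http://localhost:3010';

async function main() {
    const login = await fetch(`${BASE}/api/auth/login`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ email: 'admin@example.com', password: 'password' }) // placeholders
    });
    if (!login.ok) throw new Error(`login failed: ${login.status}`);
    const { token, user } = await login.json();
    console.log('logged in as', user.email);

    // A still-valid token can be exchanged for a fresh 7-day one
    const refresh = await fetch(`${BASE}/api/auth/refresh`, {
        method: 'POST',
        headers: { Authorization: `Bearer ${token}` }
    });
    console.log('refreshed token:', (await refresh.json()).token.slice(0, 20) + '...');
}

main().catch(console.error);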

backend/dist/routes/campaigns.js vendored Normal file

@@ -0,0 +1,163 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all campaigns
router.get('/', async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT c.*, COUNT(cp.product_id) as product_count
FROM campaigns c
LEFT JOIN campaign_products cp ON c.id = cp.campaign_id
GROUP BY c.id
ORDER BY c.created_at DESC
`);
res.json({ campaigns: result.rows });
}
catch (error) {
console.error('Error fetching campaigns:', error);
res.status(500).json({ error: 'Failed to fetch campaigns' });
}
});
// Get single campaign with products
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
const campaignResult = await migrate_1.pool.query(`
SELECT * FROM campaigns WHERE id = $1
`, [id]);
if (campaignResult.rows.length === 0) {
return res.status(404).json({ error: 'Campaign not found' });
}
const productsResult = await migrate_1.pool.query(`
SELECT p.*, cp.display_order
FROM products p
JOIN campaign_products cp ON p.id = cp.product_id
WHERE cp.campaign_id = $1
ORDER BY cp.display_order
`, [id]);
res.json({
campaign: campaignResult.rows[0],
products: productsResult.rows
});
}
catch (error) {
console.error('Error fetching campaign:', error);
res.status(500).json({ error: 'Failed to fetch campaign' });
}
});
// Create campaign
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { name, slug, description, display_style, active, start_date, end_date } = req.body;
if (!name || !slug) {
return res.status(400).json({ error: 'Name and slug required' });
}
const result = await migrate_1.pool.query(`
INSERT INTO campaigns (name, slug, description, display_style, active, start_date, end_date)
VALUES ($1, $2, $3, $4, $5, $6, $7)
RETURNING *
`, [name, slug, description, display_style || 'grid', active !== false, start_date, end_date]);
res.status(201).json({ campaign: result.rows[0] });
}
catch (error) {
console.error('Error creating campaign:', error);
if (error.code === '23505') {
return res.status(409).json({ error: 'Campaign slug already exists' });
}
res.status(500).json({ error: 'Failed to create campaign' });
}
});
// Update campaign
router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { name, slug, description, display_style, active, start_date, end_date } = req.body;
const result = await migrate_1.pool.query(`
UPDATE campaigns
SET name = COALESCE($1, name),
slug = COALESCE($2, slug),
description = COALESCE($3, description),
display_style = COALESCE($4, display_style),
active = COALESCE($5, active),
start_date = COALESCE($6, start_date),
end_date = COALESCE($7, end_date),
updated_at = CURRENT_TIMESTAMP
WHERE id = $8
RETURNING *
`, [name, slug, description, display_style, active, start_date, end_date, id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Campaign not found' });
}
res.json({ campaign: result.rows[0] });
}
catch (error) {
console.error('Error updating campaign:', error);
if (error.code === '23505') {
return res.status(409).json({ error: 'Campaign slug already exists' });
}
res.status(500).json({ error: 'Failed to update campaign' });
}
});
// Delete campaign
router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
DELETE FROM campaigns WHERE id = $1 RETURNING id
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Campaign not found' });
}
res.json({ message: 'Campaign deleted successfully' });
}
catch (error) {
console.error('Error deleting campaign:', error);
res.status(500).json({ error: 'Failed to delete campaign' });
}
});
// Add product to campaign
router.post('/:id/products', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { product_id, display_order } = req.body;
if (!product_id) {
return res.status(400).json({ error: 'Product ID required' });
}
const result = await migrate_1.pool.query(`
INSERT INTO campaign_products (campaign_id, product_id, display_order)
VALUES ($1, $2, $3)
ON CONFLICT (campaign_id, product_id)
DO UPDATE SET display_order = $3
RETURNING *
`, [id, product_id, display_order || 0]);
res.status(201).json({ campaign_product: result.rows[0] });
}
catch (error) {
console.error('Error adding product to campaign:', error);
res.status(500).json({ error: 'Failed to add product to campaign' });
}
});
// Remove product from campaign
router.delete('/:id/products/:product_id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id, product_id } = req.params;
const result = await migrate_1.pool.query(`
DELETE FROM campaign_products
WHERE campaign_id = $1 AND product_id = $2
RETURNING *
`, [id, product_id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Product not in campaign' });
}
res.json({ message: 'Product removed from campaign' });
}
catch (error) {
console.error('Error removing product from campaign:', error);
res.status(500).json({ error: 'Failed to remove product from campaign' });
}
});
exports.default = router;
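
Adding a product upserts via ON CONFLICT, so re-posting the same pair simply reorders it; the ids below are placeholders and the token must belong to an admin or superadmin:

// campaign-add.js - hypothetical attach of product 42 to campaign 1 (Node 18+)
const BASE = process.env.API_BASE || 'http://localhost:3010';

fetch(`${BASE}/api/campaigns/1/products`, {
    method: 'POST',
    headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.TOKEN}`
    },
    body: JSON.stringify({ product_id: 42, display_order: 0 })
})
    .then((r) => r.json())
    .then(console.log);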

backend/dist/routes/categories.js vendored Normal file

@@ -0,0 +1,84 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get categories (flat list)
router.get('/', async (req, res) => {
try {
const { store_id } = req.query;
let query = `
SELECT
c.*,
COUNT(DISTINCT p.id) as product_count,
pc.name as parent_name
FROM categories c
LEFT JOIN products p ON c.id = p.category_id
LEFT JOIN categories pc ON c.parent_id = pc.id
`;
const params = [];
if (store_id) {
query += ' WHERE c.store_id = $1';
params.push(store_id);
}
query += `
GROUP BY c.id, pc.name
ORDER BY c.display_order, c.name
`;
const result = await migrate_1.pool.query(query, params);
res.json({ categories: result.rows });
}
catch (error) {
console.error('Error fetching categories:', error);
res.status(500).json({ error: 'Failed to fetch categories' });
}
});
// Get category tree (hierarchical)
router.get('/tree', async (req, res) => {
try {
const { store_id } = req.query;
if (!store_id) {
return res.status(400).json({ error: 'store_id is required' });
}
// Get all categories for the store
const result = await migrate_1.pool.query(`
SELECT
c.*,
COUNT(DISTINCT p.id) as product_count
FROM categories c
LEFT JOIN products p ON c.id = p.category_id AND p.in_stock = true
WHERE c.store_id = $1
GROUP BY c.id
ORDER BY c.display_order, c.name
`, [store_id]);
// Build tree structure
const categories = result.rows;
const categoryMap = new Map();
const tree = [];
// First pass: create map
categories.forEach(cat => {
categoryMap.set(cat.id, { ...cat, children: [] });
});
// Second pass: build tree
categories.forEach(cat => {
const node = categoryMap.get(cat.id);
if (cat.parent_id) {
const parent = categoryMap.get(cat.parent_id);
if (parent) {
parent.children.push(node);
}
}
else {
tree.push(node);
}
});
res.json({ tree });
}
catch (error) {
console.error('Error fetching category tree:', error);
res.status(500).json({ error: 'Failed to fetch category tree' });
}
});
exports.default = router;
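
The /tree endpoint nests categories by parent_id; a consumption sketch with a placeholder store id:

// category-tree.js - hypothetical depth-first print of the tree (Node 18+)
const BASE = process.env.API_BASE || 'http://localhost:3010';

async function main() {
    const res = await fetch(`${BASE}/api/categories/tree?store_id=1`, {
        headers: { Authorization: `Bearer ${process.env.TOKEN}` }
    });
    const { tree } = await res.json();
    const walk = (nodes, depth = 0) => nodes.forEach((n) => {
        console.log(`${'  '.repeat(depth)}${n.name} (${n.product_count})`);
        walk(n.children, depth + 1);
    });
    walk(tree);
}

main().catch(console.error);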

backend/dist/routes/dashboard.js vendored Normal file

@@ -0,0 +1,102 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get dashboard stats
router.get('/stats', async (req, res) => {
try {
// Store stats
const storesResult = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE active = true) as active,
MIN(last_scraped_at) as oldest_scrape,
MAX(last_scraped_at) as latest_scrape
FROM stores
`);
// Product stats
const productsResult = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE in_stock = true) as in_stock,
COUNT(*) FILTER (WHERE local_image_path IS NOT NULL) as with_images
FROM products
`);
// Campaign stats
const campaignsResult = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE active = true) as active
FROM campaigns
`);
// Recent clicks (last 24 hours)
const clicksResult = await migrate_1.pool.query(`
SELECT COUNT(*) as clicks_24h
FROM clicks
WHERE clicked_at >= NOW() - INTERVAL '24 hours'
`);
// Recent products added (last 24 hours)
const recentProductsResult = await migrate_1.pool.query(`
SELECT COUNT(*) as new_products_24h
FROM products
WHERE first_seen_at >= NOW() - INTERVAL '24 hours'
`);
// Proxy stats
const proxiesResult = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE active = true) as active,
COUNT(*) FILTER (WHERE is_anonymous = true) as anonymous
FROM proxies
`);
res.json({
stores: storesResult.rows[0],
products: productsResult.rows[0],
campaigns: campaignsResult.rows[0],
clicks: clicksResult.rows[0],
recent: recentProductsResult.rows[0],
proxies: proxiesResult.rows[0]
});
}
catch (error) {
console.error('Error fetching dashboard stats:', error);
res.status(500).json({ error: 'Failed to fetch dashboard stats' });
}
});
// Get recent activity
router.get('/activity', async (req, res) => {
try {
const { limit = 20 } = req.query;
// Recent scrapes
const scrapesResult = await migrate_1.pool.query(`
SELECT s.name, s.last_scraped_at,
COUNT(p.id) as product_count
FROM stores s
LEFT JOIN products p ON s.id = p.store_id AND p.last_seen_at = s.last_scraped_at
WHERE s.last_scraped_at IS NOT NULL
GROUP BY s.id, s.name, s.last_scraped_at
ORDER BY s.last_scraped_at DESC
LIMIT $1
`, [limit]);
// Recent products
const productsResult = await migrate_1.pool.query(`
SELECT p.name, p.price, s.name as store_name, p.first_seen_at
FROM products p
JOIN stores s ON p.store_id = s.id
ORDER BY p.first_seen_at DESC
LIMIT $1
`, [limit]);
res.json({
recent_scrapes: scrapesResult.rows,
recent_products: productsResult.rows
});
}
catch (error) {
console.error('Error fetching dashboard activity:', error);
res.status(500).json({ error: 'Failed to fetch dashboard activity' });
}
});
exports.default = router;

backend/dist/routes/logs.js vendored Normal file

@@ -0,0 +1,29 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const logger_1 = require("../services/logger");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { limit = '100', level, category } = req.query;
const logs = logger_1.logger.getLogs(parseInt(limit), level, category);
res.json({ logs });
}
catch (error) {
console.error('Error fetching logs:', error);
res.status(500).json({ error: 'Failed to fetch logs' });
}
});
router.delete('/', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
try {
logger_1.logger.clear();
res.json({ message: 'Logs cleared' });
}
catch (error) {
console.error('Error clearing logs:', error);
res.status(500).json({ error: 'Failed to clear logs' });
}
});
exports.default = router;

backend/dist/routes/products.js vendored Normal file

@@ -0,0 +1,112 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all products with filters
router.get('/', async (req, res) => {
try {
const { store_id, category_id, in_stock, search, limit = 50, offset = 0 } = req.query;
let query = `
SELECT p.*, s.name as store_name, c.name as category_name
FROM products p
LEFT JOIN stores s ON p.store_id = s.id
LEFT JOIN categories c ON p.category_id = c.id
WHERE 1=1
`;
const params = [];
let paramCount = 1;
if (store_id) {
query += ` AND p.store_id = $${paramCount}`;
params.push(store_id);
paramCount++;
}
if (category_id) {
query += ` AND p.category_id = $${paramCount}`;
params.push(category_id);
paramCount++;
}
if (in_stock !== undefined) {
query += ` AND p.in_stock = $${paramCount}`;
params.push(in_stock === 'true');
paramCount++;
}
if (search) {
query += ` AND (p.name ILIKE $${paramCount} OR p.brand ILIKE $${paramCount})`;
params.push(`%${search}%`);
paramCount++;
}
query += ` ORDER BY p.last_seen_at DESC LIMIT $${paramCount} OFFSET $${paramCount + 1}`;
params.push(limit, offset);
const result = await migrate_1.pool.query(query, params);
// Add image URLs
const products = result.rows.map(p => ({
...p,
image_url_full: p.local_image_path ? (0, minio_1.getImageUrl)(p.local_image_path) : p.image_url
}));
// Get total count
let countQuery = `SELECT COUNT(*) FROM products p WHERE 1=1`;
const countParams = [];
let countParamCount = 1;
if (store_id) {
countQuery += ` AND p.store_id = $${countParamCount}`;
countParams.push(store_id);
countParamCount++;
}
if (category_id) {
countQuery += ` AND p.category_id = $${countParamCount}`;
countParams.push(category_id);
countParamCount++;
}
if (in_stock !== undefined) {
countQuery += ` AND p.in_stock = $${countParamCount}`;
countParams.push(in_stock === 'true');
countParamCount++;
}
if (search) {
countQuery += ` AND (p.name ILIKE $${countParamCount} OR p.brand ILIKE $${countParamCount})`;
countParams.push(`%${search}%`);
countParamCount++;
}
const countResult = await migrate_1.pool.query(countQuery, countParams);
res.json({
products,
total: parseInt(countResult.rows[0].count),
limit: parseInt(limit),
offset: parseInt(offset)
});
}
catch (error) {
console.error('Error fetching products:', error);
res.status(500).json({ error: 'Failed to fetch products' });
}
});
// Get single product
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
SELECT p.*, s.name as store_name, c.name as category_name
FROM products p
LEFT JOIN stores s ON p.store_id = s.id
LEFT JOIN categories c ON p.category_id = c.id
WHERE p.id = $1
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Product not found' });
}
const product = result.rows[0];
product.image_url_full = product.local_image_path
? (0, minio_1.getImageUrl)(product.local_image_path)
: product.image_url;
res.json({ product });
}
catch (error) {
console.error('Error fetching product:', error);
res.status(500).json({ error: 'Failed to fetch product' });
}
});
exports.default = router;
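
All list filters are optional and combine with AND; a filtered-search sketch with placeholder values:

// product-search.js - hypothetical filtered listing (Node 18+)
const BASE = process.env.API_BASE || 'http://localhost:3010';
const qs = new URLSearchParams({
    store_id: '1',      // placeholder
    in_stock: 'true',
    search: 'gummies',  // ILIKE match on name or brand
    limit: '25',
    offset: '0'
});

fetch(`${BASE}/api/products?${qs}`, {
    headers: { Authorization: `Bearer ${process.env.TOKEN}` }
})
    .then((r) => r.json())
    .then(({ products, total }) => console.log(`${products.length} of ${total} matches`));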

backend/dist/routes/proxies.js vendored Normal file

@@ -0,0 +1,174 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all proxies
router.get('/', async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT id, host, port, protocol, active, is_anonymous,
last_tested_at, test_result, response_time_ms, created_at
FROM proxies
ORDER BY created_at DESC
`);
res.json({ proxies: result.rows });
}
catch (error) {
console.error('Error fetching proxies:', error);
res.status(500).json({ error: 'Failed to fetch proxies' });
}
});
// Get single proxy
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
SELECT id, host, port, protocol, username, active, is_anonymous,
last_tested_at, test_result, response_time_ms, created_at
FROM proxies
WHERE id = $1
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Proxy not found' });
}
res.json({ proxy: result.rows[0] });
}
catch (error) {
console.error('Error fetching proxy:', error);
res.status(500).json({ error: 'Failed to fetch proxy' });
}
});
// Add single proxy
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { host, port, protocol, username, password } = req.body;
if (!host || !port || !protocol) {
return res.status(400).json({ error: 'Host, port, and protocol required' });
}
// Test and add proxy
const proxyId = await (0, proxy_1.addProxy)(host, port, protocol, username, password);
const result = await migrate_1.pool.query(`
SELECT * FROM proxies WHERE id = $1
`, [proxyId]);
res.status(201).json({ proxy: result.rows[0] });
}
catch (error) {
console.error('Error adding proxy:', error);
res.status(400).json({ error: error.message || 'Failed to add proxy' });
}
});
// Add multiple proxies
router.post('/bulk', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { proxies } = req.body;
if (!proxies || !Array.isArray(proxies)) {
return res.status(400).json({ error: 'Proxies array required' });
}
const result = await (0, proxy_1.addProxiesFromList)(proxies);
res.status(201).json(result);
}
catch (error) {
console.error('Error adding proxies:', error);
res.status(500).json({ error: 'Failed to add proxies' });
}
});
// Test single proxy
router.post('/:id/test', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const proxyResult = await migrate_1.pool.query(`
SELECT host, port, protocol, username, password
FROM proxies
WHERE id = $1
`, [id]);
if (proxyResult.rows.length === 0) {
return res.status(404).json({ error: 'Proxy not found' });
}
const proxy = proxyResult.rows[0];
const testResult = await (0, proxy_1.testProxy)(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
// Update proxy with test results
await migrate_1.pool.query(`
UPDATE proxies
SET last_tested_at = CURRENT_TIMESTAMP,
test_result = $1,
response_time_ms = $2,
is_anonymous = $3,
active = $4
WHERE id = $5
`, [
testResult.success ? 'success' : 'failed',
testResult.responseTimeMs,
testResult.isAnonymous,
testResult.success,
id
]);
res.json({ test_result: testResult });
}
catch (error) {
console.error('Error testing proxy:', error);
res.status(500).json({ error: 'Failed to test proxy' });
}
});
// Test all proxies
router.post('/test-all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
// Run in background
(0, proxy_1.testAllProxies)().catch(err => {
console.error('Background proxy testing error:', err);
});
res.json({ message: 'Proxy testing started in background' });
}
catch (error) {
console.error('Error starting proxy tests:', error);
res.status(500).json({ error: 'Failed to start proxy tests' });
}
});
// Update proxy
router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { host, port, protocol, username, password, active } = req.body;
const result = await migrate_1.pool.query(`
UPDATE proxies
SET host = COALESCE($1, host),
port = COALESCE($2, port),
protocol = COALESCE($3, protocol),
username = COALESCE($4, username),
password = COALESCE($5, password),
active = COALESCE($6, active),
updated_at = CURRENT_TIMESTAMP
WHERE id = $7
RETURNING *
`, [host, port, protocol, username, password, active, id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Proxy not found' });
}
res.json({ proxy: result.rows[0] });
}
catch (error) {
console.error('Error updating proxy:', error);
res.status(500).json({ error: 'Failed to update proxy' });
}
});
// Delete proxy
router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
DELETE FROM proxies WHERE id = $1 RETURNING id
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Proxy not found' });
}
res.json({ message: 'Proxy deleted successfully' });
}
catch (error) {
console.error('Error deleting proxy:', error);
res.status(500).json({ error: 'Failed to delete proxy' });
}
});
exports.default = router;
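
A bulk-import sketch; the per-item shape is an assumption inferred from the single-proxy route (host/port/protocol plus optional credentials), since addProxiesFromList itself is not shown in this commit view:

// proxy-bulk.js - hypothetical bulk add (Node 18+)
const BASE = process.env.API_BASE || 'http://localhost:3010';

fetch(`${BASE}/api/proxies/bulk`, {
    method: 'POST',
    headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.TOKEN}`
    },
    body: JSON.stringify({
        proxies: [
            { host: '10.0.0.5', port: 1080, protocol: 'socks5' }, // placeholders
            { host: 'proxy.example.com', port: 8080, protocol: 'http', username: 'u', password: 'p' }
        ]
    })
})
    .then((r) => r.json())
    .then(console.log);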

backend/dist/routes/scraper-monitor.js vendored Normal file

@@ -0,0 +1,130 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.activeScrapers = void 0;
exports.registerScraper = registerScraper;
exports.updateScraperStats = updateScraperStats;
exports.completeScraper = completeScraper;
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
exports.activeScrapers = new Map();
// Get all active scrapers
router.get('/active', async (req, res) => {
try {
const scrapers = Array.from(exports.activeScrapers.values()).map(scraper => ({
...scraper,
duration: Date.now() - scraper.startTime.getTime(),
isStale: Date.now() - scraper.lastUpdate.getTime() > 60000 // 1 minute
}));
res.json({ scrapers });
}
catch (error) {
console.error('Error fetching active scrapers:', error);
res.status(500).json({ error: 'Failed to fetch active scrapers' });
}
});
// Get scraper by ID
router.get('/active/:id', async (req, res) => {
try {
const { id } = req.params;
const scraper = exports.activeScrapers.get(id);
if (!scraper) {
return res.status(404).json({ error: 'Scraper not found' });
}
res.json({
scraper: {
...scraper,
duration: Date.now() - scraper.startTime.getTime(),
isStale: Date.now() - scraper.lastUpdate.getTime() > 60000
}
});
}
catch (error) {
console.error('Error fetching scraper:', error);
res.status(500).json({ error: 'Failed to fetch scraper' });
}
});
// Get scraper history (last 50 completed scrapes)
router.get('/history', async (req, res) => {
try {
const { limit = 50, store_id } = req.query;
let query = `
SELECT
s.id as store_id,
s.name as store_name,
c.id as category_id,
c.name as category_name,
c.last_scraped_at,
(
SELECT COUNT(*)
FROM products p
WHERE p.store_id = s.id
AND p.category_id = c.id
) as product_count
FROM stores s
LEFT JOIN categories c ON c.store_id = s.id
WHERE c.last_scraped_at IS NOT NULL
`;
const params = [];
let paramCount = 1;
if (store_id) {
query += ` AND s.id = $${paramCount}`;
params.push(store_id);
paramCount++;
}
query += ` ORDER BY c.last_scraped_at DESC LIMIT $${paramCount}`;
params.push(limit);
const result = await migrate_1.pool.query(query, params);
res.json({ history: result.rows });
}
catch (error) {
console.error('Error fetching scraper history:', error);
res.status(500).json({ error: 'Failed to fetch scraper history' });
}
});
// Helper function to register a scraper
function registerScraper(id, storeId, storeName, categoryId, categoryName) {
exports.activeScrapers.set(id, {
id,
storeId,
storeName,
categoryId,
categoryName,
startTime: new Date(),
lastUpdate: new Date(),
status: 'running',
stats: {
requestsTotal: 0,
requestsSuccess: 0,
itemsSaved: 0,
itemsDropped: 0,
errorsCount: 0
}
});
}
// Helper function to update scraper stats
function updateScraperStats(id, stats, currentActivity) {
const scraper = exports.activeScrapers.get(id);
if (scraper) {
scraper.stats = { ...scraper.stats, ...stats };
scraper.lastUpdate = new Date();
if (currentActivity) {
scraper.currentActivity = currentActivity;
}
}
}
// Helper function to mark scraper as completed
function completeScraper(id, error) {
const scraper = exports.activeScrapers.get(id);
if (scraper) {
scraper.status = error ? 'error' : 'completed';
scraper.lastUpdate = new Date();
// Remove after 5 minutes
setTimeout(() => {
exports.activeScrapers.delete(id);
}, 5 * 60 * 1000);
}
}
exports.default = router;
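
A sketch of how a scraper process might drive the exported helpers; the run id format is arbitrary, and the store/category values are placeholders:

// hypothetical scraper-side usage of the monitor helpers
const monitor = require('./backend/dist/routes/scraper-monitor');

const runId = `store-1-shop-${Date.now()}`;
monitor.registerScraper(runId, 1, 'Deeply Rooted', 2, 'Shop');

// inside the scrape loop, after each page:
monitor.updateScraperStats(runId, { requestsTotal: 10, itemsSaved: 48 }, 'page 2 of 5');

// on finish; the entry self-expires from activeScrapers five minutes later
monitor.completeScraper(runId);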

backend/dist/routes/settings.js vendored Normal file

@@ -0,0 +1,118 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const scheduler_1 = require("../services/scheduler");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all settings
router.get('/', async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT key, value, description, updated_at
FROM settings
ORDER BY key
`);
res.json({ settings: result.rows });
}
catch (error) {
console.error('Error fetching settings:', error);
res.status(500).json({ error: 'Failed to fetch settings' });
}
});
// Get single setting
router.get('/:key', async (req, res) => {
try {
const { key } = req.params;
const result = await migrate_1.pool.query(`
SELECT key, value, description, updated_at
FROM settings
WHERE key = $1
`, [key]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Setting not found' });
}
res.json({ setting: result.rows[0] });
}
catch (error) {
console.error('Error fetching setting:', error);
res.status(500).json({ error: 'Failed to fetch setting' });
}
});
// Update setting
router.put('/:key', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { key } = req.params;
const { value } = req.body;
if (value === undefined) {
return res.status(400).json({ error: 'Value required' });
}
const result = await migrate_1.pool.query(`
UPDATE settings
SET value = $1, updated_at = CURRENT_TIMESTAMP
WHERE key = $2
RETURNING *
`, [value, key]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Setting not found' });
}
// Restart scheduler if scrape settings changed
if (key === 'scrape_interval_hours' || key === 'scrape_specials_time') {
console.log('Restarting scheduler due to setting change...');
await (0, scheduler_1.restartScheduler)();
}
res.json({ setting: result.rows[0] });
}
catch (error) {
console.error('Error updating setting:', error);
res.status(500).json({ error: 'Failed to update setting' });
}
});
// Update multiple settings at once
router.put('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { settings } = req.body;
if (!settings || !Array.isArray(settings)) {
return res.status(400).json({ error: 'Settings array required' });
}
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
const updated = [];
let needsSchedulerRestart = false;
for (const setting of settings) {
const result = await client.query(`
UPDATE settings
SET value = $1, updated_at = CURRENT_TIMESTAMP
WHERE key = $2
RETURNING *
`, [setting.value, setting.key]);
if (result.rows.length > 0) {
updated.push(result.rows[0]);
if (setting.key === 'scrape_interval_hours' || setting.key === 'scrape_specials_time') {
needsSchedulerRestart = true;
}
}
}
await client.query('COMMIT');
if (needsSchedulerRestart) {
console.log('Restarting scheduler due to setting changes...');
await (0, scheduler_1.restartScheduler)();
}
res.json({ settings: updated });
}
catch (error) {
await client.query('ROLLBACK');
throw error;
}
finally {
client.release();
}
}
catch (error) {
console.error('Error updating settings:', error);
res.status(500).json({ error: 'Failed to update settings' });
}
});
exports.default = router;
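
Updating either scheduler key through the bulk route triggers one restart after the transaction commits; a sketch with illustrative values:

// settings-bulk.js - hypothetical transactional update of two keys (Node 18+)
const BASE = process.env.API_BASE || 'http://localhost:3010';

fetch(`${BASE}/api/settings`, {
    method: 'PUT',
    headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.TOKEN}`
    },
    body: JSON.stringify({
        settings: [
            { key: 'scrape_interval_hours', value: '6' },
            { key: 'scrape_specials_time', value: '00:30' }
        ]
    })
})
    .then((r) => r.json())
    .then(console.log);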

backend/dist/routes/stores.js vendored Normal file

@@ -0,0 +1,257 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const scraper_v2_1 = require("../scraper-v2");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all stores
router.get('/', async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT
s.*,
COUNT(DISTINCT p.id) as product_count,
COUNT(DISTINCT c.id) as category_count
FROM stores s
LEFT JOIN products p ON s.id = p.store_id
LEFT JOIN categories c ON s.id = c.store_id
GROUP BY s.id
ORDER BY s.name
`);
res.json({ stores: result.rows });
}
catch (error) {
console.error('Error fetching stores:', error);
res.status(500).json({ error: 'Failed to fetch stores' });
}
});
// Get single store
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
SELECT
s.*,
COUNT(DISTINCT p.id) as product_count,
COUNT(DISTINCT c.id) as category_count
FROM stores s
LEFT JOIN products p ON s.id = p.store_id
LEFT JOIN categories c ON s.id = c.store_id
WHERE s.id = $1
GROUP BY s.id
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
res.json(result.rows[0]);
}
catch (error) {
console.error('Error fetching store:', error);
res.status(500).json({ error: 'Failed to fetch store' });
}
});
// Create store
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { name, slug, dutchie_url, active, scrape_enabled } = req.body;
const result = await migrate_1.pool.query(`
INSERT INTO stores (name, slug, dutchie_url, active, scrape_enabled)
VALUES ($1, $2, $3, $4, $5)
RETURNING *
`, [name, slug, dutchie_url, active ?? true, scrape_enabled ?? true]);
res.status(201).json(result.rows[0]);
}
catch (error) {
console.error('Error creating store:', error);
res.status(500).json({ error: 'Failed to create store' });
}
});
// Update store
router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { name, slug, dutchie_url, active, scrape_enabled } = req.body;
const result = await migrate_1.pool.query(`
UPDATE stores
SET name = COALESCE($1, name),
slug = COALESCE($2, slug),
dutchie_url = COALESCE($3, dutchie_url),
active = COALESCE($4, active),
scrape_enabled = COALESCE($5, scrape_enabled),
updated_at = CURRENT_TIMESTAMP
WHERE id = $6
RETURNING *
`, [name, slug, dutchie_url, active, scrape_enabled, id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
res.json(result.rows[0]);
}
catch (error) {
console.error('Error updating store:', error);
res.status(500).json({ error: 'Failed to update store' });
}
});
// Delete store
router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query('DELETE FROM stores WHERE id = $1 RETURNING *', [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
res.json({ message: 'Store deleted successfully' });
}
catch (error) {
console.error('Error deleting store:', error);
res.status(500).json({ error: 'Failed to delete store' });
}
});
// Trigger scrape for a store
router.post('/:id/scrape', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { parallel = 3 } = req.body; // Default to 3 parallel scrapers
const storeResult = await migrate_1.pool.query('SELECT id FROM stores WHERE id = $1', [id]);
if (storeResult.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
(0, scraper_v2_1.scrapeStore)(parseInt(id), parseInt(parallel)).catch(err => {
console.error('Background scrape error:', err);
});
res.json({
message: 'Scrape started',
parallel: parseInt(parallel)
});
}
catch (error) {
console.error('Error triggering scrape:', error);
res.status(500).json({ error: 'Failed to trigger scrape' });
}
});
// Download missing images for a store
router.post('/:id/download-images', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const storeResult = await migrate_1.pool.query('SELECT id, name FROM stores WHERE id = $1', [id]);
if (storeResult.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
const store = storeResult.rows[0];
const productsResult = await migrate_1.pool.query(`
SELECT id, name, image_url
FROM products
WHERE store_id = $1
AND image_url IS NOT NULL
AND local_image_path IS NULL
`, [id]);
(async () => {
const { uploadImageFromUrl } = await Promise.resolve().then(() => __importStar(require('../utils/minio')));
let downloaded = 0;
for (const product of productsResult.rows) {
try {
console.log(`📸 Downloading image for: ${product.name}`);
const localPath = await uploadImageFromUrl(product.image_url, product.id);
await migrate_1.pool.query(`
UPDATE products
SET local_image_path = $1
WHERE id = $2
`, [localPath, product.id]);
downloaded++;
}
catch (error) {
console.error(`Failed to download image for ${product.name}:`, error);
}
}
console.log(`✅ Downloaded ${downloaded} of ${productsResult.rows.length} missing images for ${store.name}`);
})().catch(err => console.error('Background image download error:', err));
res.json({
message: 'Image download started',
total_missing: productsResult.rows.length
});
}
catch (error) {
console.error('Error triggering image download:', error);
res.status(500).json({ error: 'Failed to trigger image download' });
}
});
// Discover categories for a store
router.post('/:id/discover-categories', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const storeResult = await migrate_1.pool.query('SELECT id FROM stores WHERE id = $1', [id]);
if (storeResult.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
(0, scraper_v2_1.discoverCategories)(parseInt(id)).catch(err => {
console.error('Background category discovery error:', err);
});
res.json({ message: 'Category discovery started' });
}
catch (error) {
console.error('Error triggering category discovery:', error);
res.status(500).json({ error: 'Failed to trigger category discovery' });
}
});
// Debug scraper
router.post('/:id/debug-scrape', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
console.log('Debug scrape triggered for store:', id);
const categoryResult = await migrate_1.pool.query(`
SELECT c.dutchie_url, c.name
FROM categories c
WHERE c.store_id = $1 AND c.slug = 'edibles'
LIMIT 1
`, [id]);
if (categoryResult.rows.length === 0) {
return res.status(404).json({ error: 'Edibles category not found' });
}
console.log('Found category:', categoryResult.rows[0]);
const { debugDutchiePage } = await Promise.resolve().then(() => __importStar(require('../services/scraper-debug')));
debugDutchiePage(categoryResult.rows[0].dutchie_url).catch(err => {
console.error('Debug error:', err);
});
res.json({ message: 'Debug started, check logs', url: categoryResult.rows[0].dutchie_url });
}
catch (error) {
console.error('Debug endpoint error:', error);
res.status(500).json({ error: 'Failed to debug' });
}
});
exports.default = router;
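// Usage sketch for the routes above. The mount prefix is an assumption
// ('/api/stores' is typical for a router with these ':id' paths; the real
// prefix is set where this router is registered, outside this file), and
// 'parallel' is read from the request as defined earlier in this route file:
//
//   POST /api/stores/42/scrape               -> start a background store scrape
//   POST /api/stores/42/download-images      -> backfill missing product images
//   POST /api/stores/42/discover-categories  -> rebuild the category tree
//   POST /api/stores/42/debug-scrape         -> dump debug info for the edibles category
//
// All four require a superadmin/admin bearer token and return immediately;
// the actual work continues in the background and surfaces in the logs.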

324
backend/dist/scraper-v2/downloader.js vendored Normal file
View File

@@ -0,0 +1,324 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Downloader = void 0;
const puppeteer_1 = __importDefault(require("puppeteer"));
const axios_1 = __importDefault(require("axios"));
const types_1 = require("./types");
const logger_1 = require("../services/logger");
class Downloader {
browser = null;
page = null;
pageInUse = false;
/**
* Initialize browser instance (lazy initialization)
*/
async getBrowser() {
if (!this.browser || !this.browser.isConnected()) {
const launchOptions = {
headless: 'new',
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'--window-size=1920,1080',
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process'
]
};
this.browser = await puppeteer_1.default.launch(launchOptions);
logger_1.logger.info('scraper', 'Browser instance created');
}
return this.browser;
}
/**
* Get or create a page instance
*/
async getPage() {
if (!this.page || this.page.isClosed()) {
const browser = await this.getBrowser();
this.page = await browser.newPage();
await this.page.setViewport({ width: 1920, height: 1080 });
logger_1.logger.debug('scraper', 'New page created');
}
return this.page;
}
/**
* Apply stealth mode to page
*/
async makePageStealthy(page) {
await page.evaluateOnNewDocument(() => {
// @ts-ignore - runs in browser context
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
// @ts-ignore - runs in browser context
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
// @ts-ignore - runs in browser context
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
});
// @ts-ignore - runs in browser context
window.chrome = {
runtime: {},
};
// @ts-ignore - runs in browser context
const originalQuery = window.navigator.permissions.query;
// @ts-ignore - runs in browser context
window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
? Promise.resolve({ state: 'denied' })
: originalQuery(parameters);
});
}
/**
* Configure proxy for browser
*/
getProxyArgs(proxy) {
if (proxy.protocol === 'socks5') {
return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`];
}
else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`];
}
return [];
}
/**
* HTTP-based fetch (lightweight, fast)
*/
async httpFetch(request) {
try {
const config = {
timeout: 30000,
headers: {
'User-Agent': request.metadata.userAgent || 'Mozilla/5.0',
...request.metadata.headers
},
validateStatus: () => true // Don't throw on any status
};
// Add proxy if available
if (request.metadata.proxy) {
const proxy = request.metadata.proxy;
config.proxy = {
host: proxy.host,
port: proxy.port,
protocol: proxy.protocol
};
if (proxy.username && proxy.password) {
config.proxy.auth = {
username: proxy.username,
password: proxy.password
};
}
}
const response = await axios_1.default.get(request.url, config);
return {
url: request.url,
statusCode: response.status,
content: response.data,
metadata: {
headers: response.headers,
method: 'http'
},
request
};
}
catch (error) {
const scraperError = new Error(error.message);
if (error.code === 'ETIMEDOUT' || error.code === 'ECONNABORTED') {
scraperError.type = types_1.ErrorType.TIMEOUT;
}
else if (error.code === 'ECONNREFUSED' || error.code === 'ENOTFOUND') {
scraperError.type = types_1.ErrorType.NETWORK_ERROR;
}
else {
scraperError.type = types_1.ErrorType.UNKNOWN;
}
scraperError.retryable = true;
scraperError.request = request;
throw scraperError;
}
}
/**
* Browser-based fetch (for JS-heavy sites)
*/
async browserFetch(request) {
// Wait if page is in use
while (this.pageInUse) {
await new Promise(resolve => setTimeout(resolve, 100));
}
this.pageInUse = true;
try {
const page = await this.getPage();
// Apply stealth mode if required
if (request.metadata.requiresStealth) {
await this.makePageStealthy(page);
}
// Set user agent
if (request.metadata.userAgent) {
await page.setUserAgent(request.metadata.userAgent);
}
// Navigate to page
const navigationPromise = page.goto(request.url, {
waitUntil: 'domcontentloaded',
timeout: 60000
});
const response = await navigationPromise;
if (!response) {
throw new Error('Navigation failed - no response');
}
// Wait for initial render
await page.waitForTimeout(3000);
// Check for lazy-loaded content
await this.autoScroll(page);
// Get page content
const content = await page.content();
const statusCode = response.status();
return {
url: request.url,
statusCode,
content,
metadata: {
method: 'browser',
finalUrl: page.url()
},
request
};
}
catch (error) {
const scraperError = new Error(error.message);
if (error.message.includes('timeout') || error.message.includes('Navigation timeout')) {
scraperError.type = types_1.ErrorType.TIMEOUT;
}
else if (error.message.includes('net::')) {
scraperError.type = types_1.ErrorType.NETWORK_ERROR;
}
else if (error.message.includes('404')) {
scraperError.type = types_1.ErrorType.NOT_FOUND;
}
else {
scraperError.type = types_1.ErrorType.UNKNOWN;
}
scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND;
scraperError.request = request;
throw scraperError;
}
finally {
this.pageInUse = false;
}
}
/**
* Auto-scroll to load lazy content
*/
async autoScroll(page) {
try {
await page.evaluate(async () => {
await new Promise((resolve) => {
let totalHeight = 0;
const distance = 500;
const maxScrolls = 20; // Prevent infinite scrolling
let scrollCount = 0;
const timer = setInterval(() => {
// @ts-ignore - runs in browser context
const scrollHeight = document.body.scrollHeight;
// @ts-ignore - runs in browser context
window.scrollBy(0, distance);
totalHeight += distance;
scrollCount++;
if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
clearInterval(timer);
// Scroll back to top
// @ts-ignore - runs in browser context
window.scrollTo(0, 0);
resolve();
}
}, 200);
});
});
// Wait for any lazy-loaded content
await page.waitForTimeout(1000);
}
catch (error) {
logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`);
}
}
/**
* Main fetch method - tries HTTP first, falls back to browser
*/
async fetch(request) {
const startTime = Date.now();
try {
// Force browser mode if required
if (request.metadata.requiresBrowser) {
logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`);
const response = await this.browserFetch(request);
logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`);
return response;
}
// Try HTTP first (faster)
try {
logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`);
const response = await this.httpFetch(request);
// Check if we got a meaningful response
if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`);
return response;
}
// Fall through to browser mode for non-2xx responses
logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`);
}
catch (httpError) {
logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`);
}
// Fall back to browser
request.metadata.requiresBrowser = true;
const response = await this.browserFetch(request);
logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`);
return response;
}
catch (error) {
logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`);
throw error;
}
}
/**
* Evaluate JavaScript in the current page context
*/
async evaluate(fn) {
if (!this.page || this.page.isClosed()) {
throw new Error('No active page for evaluation');
}
return await this.page.evaluate(fn);
}
/**
* Get the current page (for custom operations)
*/
async getCurrentPage() {
return this.page;
}
/**
* Close the browser
*/
async close() {
if (this.page && !this.page.isClosed()) {
await this.page.close();
this.page = null;
}
if (this.browser && this.browser.isConnected()) {
await this.browser.close();
this.browser = null;
logger_1.logger.info('scraper', 'Browser closed');
}
}
/**
* Clean up resources
*/
async cleanup() {
await this.close();
}
}
exports.Downloader = Downloader;
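// Minimal usage sketch (illustration only, never invoked): the request shape
// mirrors what the engine builds elsewhere in this commit - fetch() itself
// only reads url and metadata, while callback/maxRetries are consumed by the
// engine. The URL is a placeholder.
async function demoDownloaderFetch() {
    const downloader = new Downloader();
    try {
        const response = await downloader.fetch({
            url: 'https://example.com/menu',
            priority: 0,
            retryCount: 0,
            maxRetries: 3,
            metadata: { requiresBrowser: false },
            callback: async () => ({ items: [], requests: [] })
        });
        // metadata.method reports which path succeeded: 'http' or 'browser'
        console.log(response.statusCode, response.metadata.method);
    }
    finally {
        await downloader.cleanup();
    }
}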

652
backend/dist/scraper-v2/engine.js vendored Normal file
View File

@@ -0,0 +1,652 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.DutchieSpider = exports.ScraperEngine = void 0;
const scheduler_1 = require("./scheduler");
const downloader_1 = require("./downloader");
const middlewares_1 = require("./middlewares");
const pipelines_1 = require("./pipelines");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
/**
* Main Scraper Engine - orchestrates the entire scraping process
*/
class ScraperEngine {
scheduler;
downloader;
middlewareEngine;
pipelineEngine;
stats;
isRunning = false;
concurrency = 1; // Conservative default
constructor(concurrency = 1) {
this.scheduler = new scheduler_1.RequestScheduler();
this.downloader = new downloader_1.Downloader();
this.middlewareEngine = new middlewares_1.MiddlewareEngine();
this.pipelineEngine = new pipelines_1.PipelineEngine();
this.concurrency = concurrency;
// Initialize stats
this.stats = {
requestsTotal: 0,
requestsSuccess: 0,
requestsFailed: 0,
itemsScraped: 0,
itemsSaved: 0,
itemsDropped: 0,
errorsCount: 0,
startTime: new Date()
};
// Setup middlewares
this.setupMiddlewares();
// Setup pipelines
this.setupPipelines();
}
/**
* Setup middleware chain
*/
setupMiddlewares() {
this.middlewareEngine.use(new middlewares_1.UserAgentMiddleware());
this.middlewareEngine.use(new middlewares_1.ProxyMiddleware());
this.middlewareEngine.use(new middlewares_1.RateLimitMiddleware());
this.middlewareEngine.use(new middlewares_1.RetryMiddleware());
this.middlewareEngine.use(new middlewares_1.BotDetectionMiddleware());
this.middlewareEngine.use(new middlewares_1.StealthMiddleware());
}
/**
* Setup pipeline chain
*/
setupPipelines() {
this.pipelineEngine.use(new pipelines_1.ValidationPipeline());
this.pipelineEngine.use(new pipelines_1.SanitizationPipeline());
this.pipelineEngine.use(new pipelines_1.DeduplicationPipeline());
this.pipelineEngine.use(new pipelines_1.ImagePipeline());
this.pipelineEngine.use(new pipelines_1.StatsPipeline());
this.pipelineEngine.use(new pipelines_1.DatabasePipeline());
}
/**
* Add a request to the queue
*/
enqueue(request) {
this.scheduler.enqueue(request);
}
/**
* Start the scraping engine
*/
async start() {
if (this.isRunning) {
logger_1.logger.warn('scraper', 'Engine is already running');
return;
}
this.isRunning = true;
this.stats.startTime = new Date();
logger_1.logger.info('scraper', `🚀 Starting scraper engine (concurrency: ${this.concurrency})`);
// Process queue
await this.processQueue();
this.isRunning = false;
this.stats.endTime = new Date();
this.stats.duration = this.stats.endTime.getTime() - this.stats.startTime.getTime();
logger_1.logger.info('scraper', `✅ Scraper engine finished`);
this.logStats();
// Cleanup
await this.downloader.cleanup();
}
/**
     * Process the request queue (serially: each dequeued request is awaited
     * in turn; the concurrency setting is applied at the spider level, where
     * parallel categories each get their own engine)
*/
async processQueue() {
while (!this.scheduler.isEmpty() && this.isRunning) {
const request = this.scheduler.dequeue();
if (!request) {
// Wait a bit and check again
await new Promise(resolve => setTimeout(resolve, 100));
continue;
}
try {
await this.processRequest(request);
}
catch (error) {
logger_1.logger.error('scraper', `Failed to process request: ${error}`);
}
}
}
/**
* Process a single request
*/
async processRequest(request) {
this.stats.requestsTotal++;
try {
logger_1.logger.debug('scraper', `Processing: ${request.url}`);
// Apply request middlewares
const processedRequest = await this.middlewareEngine.processRequest(request);
// Download
let response = await this.downloader.fetch(processedRequest);
// Apply response middlewares
response = await this.middlewareEngine.processResponse(response);
// Parse response using callback
const parseResult = await request.callback(response);
// Process items through pipeline
if (parseResult.items && parseResult.items.length > 0) {
for (const item of parseResult.items) {
await this.processItem(item, 'default');
}
}
// Enqueue follow-up requests
if (parseResult.requests && parseResult.requests.length > 0) {
for (const followUpRequest of parseResult.requests) {
this.scheduler.enqueue(followUpRequest);
}
}
this.stats.requestsSuccess++;
this.scheduler.markComplete(request);
}
catch (error) {
this.stats.requestsFailed++;
this.stats.errorsCount++;
logger_1.logger.error('scraper', `Request failed: ${request.url} - ${error.message}`);
// Apply error middlewares
const handledError = await this.middlewareEngine.processError(error, request);
// If error is null, it was handled (e.g., retry)
if (handledError === null) {
this.scheduler.requeueForRetry(request);
}
else {
this.scheduler.markComplete(request);
// Call error handler if provided
if (request.errorHandler) {
await request.errorHandler(error, request);
}
}
}
}
/**
* Process an item through pipelines
*/
async processItem(item, spider) {
this.stats.itemsScraped++;
try {
const processedItem = await this.pipelineEngine.processItem(item, spider);
if (processedItem) {
this.stats.itemsSaved++;
}
else {
this.stats.itemsDropped++;
}
}
catch (error) {
logger_1.logger.error('scraper', `Failed to process item: ${error}`);
this.stats.itemsDropped++;
}
}
/**
* Log statistics
*/
logStats() {
logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
logger_1.logger.info('scraper', '📊 Scraper Statistics');
logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
logger_1.logger.info('scraper', ` Requests: ${this.stats.requestsSuccess}/${this.stats.requestsTotal} successful`);
logger_1.logger.info('scraper', ` Items: ${this.stats.itemsSaved} saved, ${this.stats.itemsDropped} dropped`);
logger_1.logger.info('scraper', ` Errors: ${this.stats.errorsCount}`);
logger_1.logger.info('scraper', ` Duration: ${Math.round((this.stats.duration || 0) / 1000)}s`);
// Get stats from StatsPipeline
const statsPipeline = this.pipelineEngine.getPipeline('StatsPipeline');
if (statsPipeline) {
const itemStats = statsPipeline.getStats();
logger_1.logger.info('scraper', ` Items with images: ${itemStats.withImages}/${itemStats.total}`);
logger_1.logger.info('scraper', ` Items with THC: ${itemStats.withThc}/${itemStats.total}`);
logger_1.logger.info('scraper', ` Items with descriptions: ${itemStats.withDescription}/${itemStats.total}`);
}
logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
}
/**
* Stop the engine
*/
stop() {
this.isRunning = false;
logger_1.logger.info('scraper', 'Stopping scraper engine...');
}
/**
* Get current stats
*/
getStats() {
return { ...this.stats };
}
/**
* Get queue stats
*/
getQueueStats() {
return this.scheduler.getStats();
}
}
exports.ScraperEngine = ScraperEngine;
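// Illustration of the request/callback contract (not invoked anywhere): a
// callback receives the Response and returns { items, requests }. Items flow
// through the pipelines; requests are re-enqueued as follow-ups. The URL and
// priority here are placeholders.
async function demoEngineRun() {
    const engine = new ScraperEngine(1);
    engine.enqueue({
        url: 'https://example.com/listing',
        priority: 100,
        maxRetries: 3,
        metadata: { requiresBrowser: false },
        callback: async (response) => {
            // parse response.content here, then return scraped items plus
            // any follow-up page requests discovered along the way
            return { items: [], requests: [] };
        }
    });
    await engine.start(); // drains the queue, logs stats, closes the browser
}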
/**
* Spider for scraping Dutchie categories
*/
class DutchieSpider {
engine;
constructor(engine) {
this.engine = engine;
}
/**
* Scrape a category
*/
async scrapeCategory(storeId, categoryId) {
logger_1.logger.info('scraper', `Starting category scrape: store=${storeId}, category=${categoryId}`);
const scraperId = `scraper-${storeId}-${categoryId}-${Date.now()}`;
let registerScraper, updateScraperStats, completeScraper;
try {
// Import monitoring functions
const monitor = await Promise.resolve().then(() => __importStar(require('../routes/scraper-monitor')));
registerScraper = monitor.registerScraper;
updateScraperStats = monitor.updateScraperStats;
completeScraper = monitor.completeScraper;
}
catch (e) {
// Monitoring not available
}
try {
// Get category info
const categoryResult = await migrate_1.pool.query(`
SELECT c.*, s.slug as store_slug, s.name as store_name
FROM categories c
JOIN stores s ON c.store_id = s.id
WHERE c.id = $1
`, [categoryId]);
if (categoryResult.rows.length === 0) {
throw new Error('Category not found');
}
const category = categoryResult.rows[0];
logger_1.logger.info('scraper', `Category: ${category.name} (${category.dutchie_url})`);
// Register with monitoring system
if (registerScraper) {
registerScraper(scraperId, storeId, category.store_name, categoryId, category.name);
}
// Mark products as out of stock before scraping
await migrate_1.pool.query(`
UPDATE products
SET in_stock = false
WHERE store_id = $1 AND category_id = $2
`, [storeId, categoryId]);
if (updateScraperStats) {
updateScraperStats(scraperId, {}, 'Marking products as out of stock');
}
// Enqueue category page request
this.engine.enqueue({
url: category.dutchie_url,
priority: 100,
maxRetries: 3,
metadata: {
requiresBrowser: true,
storeId,
categoryId,
categorySlug: category.slug,
storeSlug: category.store_slug
},
callback: this.parseCategoryPage.bind(this)
});
// Start the engine
if (updateScraperStats) {
updateScraperStats(scraperId, {}, 'Scraping category page');
}
await this.engine.start();
// Update stats from engine
const engineStats = this.engine.getStats();
if (updateScraperStats) {
updateScraperStats(scraperId, {
requestsTotal: engineStats.requestsTotal,
requestsSuccess: engineStats.requestsSuccess,
itemsSaved: engineStats.itemsSaved,
itemsDropped: engineStats.itemsDropped,
errorsCount: engineStats.errorsCount
}, 'Finalizing');
}
// Update category last_scraped_at
await migrate_1.pool.query(`
UPDATE categories
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [categoryId]);
logger_1.logger.info('scraper', `✅ Category scrape completed: ${category.name}`);
if (completeScraper) {
completeScraper(scraperId);
}
}
catch (error) {
logger_1.logger.error('scraper', `Category scrape failed: ${error}`);
if (completeScraper) {
completeScraper(scraperId, error.toString());
}
throw error;
}
}
/**
* Parse category page (product listing)
*/
async parseCategoryPage(response) {
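        // Reach into the engine's downloader (bracket access to a private
        // field) so parsing can reuse the live Puppeteer page that just
        // rendered this response.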
const page = await this.engine['downloader'].getCurrentPage();
if (!page) {
throw new Error('No active page');
}
logger_1.logger.info('scraper', 'Parsing category page...');
// Extract product cards
const productCards = await page.evaluate(() => {
// @ts-ignore - runs in browser context
const cards = document.querySelectorAll('[data-testid="product-list-item"]');
const items = [];
cards.forEach((card) => {
try {
const allText = card.textContent || '';
// Extract name
let name = '';
const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
for (const sel of nameSelectors) {
const el = card.querySelector(sel);
if (el?.textContent?.trim()) {
name = el.textContent.trim().split('\n')[0].trim();
break;
}
}
if (!name || name.length < 2)
return;
// Extract price
let price = null;
let originalPrice = null;
const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
if (priceMatches && priceMatches.length > 0) {
price = parseFloat(priceMatches[0].replace('$', ''));
if (priceMatches.length > 1) {
originalPrice = parseFloat(priceMatches[1].replace('$', ''));
}
}
// Extract link
const linkEl = card.querySelector('a[href*="/product/"]');
let href = linkEl?.getAttribute('href') || '';
if (href && href.startsWith('/')) {
// @ts-ignore - runs in browser context
href = window.location.origin + href;
}
items.push({ name, price, originalPrice, href });
}
catch (err) {
console.error('Error parsing product card:', err);
}
});
return items;
});
logger_1.logger.info('scraper', `Found ${productCards.length} products on listing page`);
// Create follow-up requests for each product
const requests = productCards.map((card, index) => ({
url: card.href,
priority: 50,
maxRetries: 3,
metadata: {
...response.request.metadata,
productName: card.name,
productPrice: card.price,
productOriginalPrice: card.originalPrice,
requiresBrowser: true
},
callback: this.parseProductPage.bind(this)
}));
return { items: [], requests };
}
/**
* Parse individual product page
*/
async parseProductPage(response) {
const page = await this.engine['downloader'].getCurrentPage();
if (!page) {
throw new Error('No active page');
}
const productName = response.request.metadata.productName;
logger_1.logger.debug('scraper', `Parsing product: ${productName}`);
// Extract product details
const details = await page.evaluate(() => {
// @ts-ignore - runs in browser context
const allText = document.body.textContent || '';
// Extract image
let fullSizeImage = null;
const mainImageSelectors = [
'img[class*="ProductImage"]',
'img[class*="product-image"]',
'[class*="ImageGallery"] img',
'main img',
'img[src*="images.dutchie.com"]'
];
for (const sel of mainImageSelectors) {
// @ts-ignore - runs in browser context
const img = document.querySelector(sel);
if (img?.src && img.src.includes('dutchie.com')) {
fullSizeImage = img.src;
break;
}
}
// Extract description
let description = '';
const descSelectors = [
'[class*="description"]',
'[class*="Description"]',
'[data-testid*="description"]',
'p[class*="product"]'
];
for (const sel of descSelectors) {
// @ts-ignore - runs in browser context
const el = document.querySelector(sel);
if (el?.textContent?.trim() && el.textContent.length > 20) {
description = el.textContent.trim();
break;
}
}
// Extract THC/CBD
let thc = null;
const thcPatterns = [
/THC[:\s]*(\d+\.?\d*)\s*%/i,
/Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
/(\d+\.?\d*)\s*%\s+THC/i
];
for (const pattern of thcPatterns) {
const match = allText.match(pattern);
if (match) {
thc = parseFloat(match[1]);
break;
}
}
let cbd = null;
const cbdPatterns = [
/CBD[:\s]*(\d+\.?\d*)\s*%/i,
/Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
/(\d+\.?\d*)\s*%\s+CBD/i
];
for (const pattern of cbdPatterns) {
const match = allText.match(pattern);
if (match) {
cbd = parseFloat(match[1]);
break;
}
}
// Extract strain type
let strainType = null;
if (allText.match(/\bindica\b/i))
strainType = 'Indica';
else if (allText.match(/\bsativa\b/i))
strainType = 'Sativa';
else if (allText.match(/\bhybrid\b/i))
strainType = 'Hybrid';
// Extract brand
let brand = null;
const brandSelectors = [
'[class*="brand"]',
'[class*="Brand"]',
'[data-testid*="brand"]'
];
for (const sel of brandSelectors) {
// @ts-ignore - runs in browser context
const el = document.querySelector(sel);
if (el?.textContent?.trim()) {
brand = el.textContent.trim();
break;
}
}
// Extract metadata
const terpenes = [];
const terpeneNames = ['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene'];
terpeneNames.forEach(terp => {
if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) {
terpenes.push(terp);
}
});
const effects = [];
const effectNames = ['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic'];
effectNames.forEach(effect => {
if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) {
effects.push(effect);
}
});
return {
fullSizeImage,
description,
thc,
cbd,
strainType,
brand,
terpenes,
effects
};
});
// Create product item
const product = {
dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`,
name: productName || 'Unknown Product',
description: details.description,
price: response.request.metadata.productPrice,
originalPrice: response.request.metadata.productOriginalPrice,
thcPercentage: details.thc || undefined,
cbdPercentage: details.cbd || undefined,
strainType: details.strainType || undefined,
brand: details.brand || undefined,
imageUrl: details.fullSizeImage || undefined,
dutchieUrl: response.url,
metadata: {
terpenes: details.terpenes,
effects: details.effects
},
storeId: response.request.metadata.storeId,
categoryId: response.request.metadata.categoryId
};
return { items: [product], requests: [] };
}
/**
* Scrape entire store
*/
async scrapeStore(storeId, parallel = 3) {
logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`);
try {
// Get all leaf categories (no children)
const categoriesResult = await migrate_1.pool.query(`
SELECT c.id, c.name
FROM categories c
WHERE c.store_id = $1
AND c.scrape_enabled = true
AND NOT EXISTS (
SELECT 1 FROM categories child
WHERE child.parent_id = c.id
)
ORDER BY c.name
`, [storeId]);
const categories = categoriesResult.rows;
logger_1.logger.info('scraper', `Found ${categories.length} categories to scrape`);
if (parallel === 1) {
// Sequential scraping (original behavior)
for (const category of categories) {
try {
await this.scrapeCategory(storeId, category.id);
await new Promise(resolve => setTimeout(resolve, 3000));
}
catch (error) {
logger_1.logger.error('scraper', `Failed to scrape category ${category.name}: ${error}`);
}
}
}
else {
// Parallel scraping with concurrency limit
const results = await this.scrapeMultipleCategoriesParallel(storeId, categories, parallel);
const successful = results.filter(r => r.status === 'fulfilled').length;
const failed = results.filter(r => r.status === 'rejected').length;
logger_1.logger.info('scraper', `Parallel scrape results: ${successful} successful, ${failed} failed`);
}
// Update store last_scraped_at
await migrate_1.pool.query(`
UPDATE stores
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [storeId]);
logger_1.logger.info('scraper', `🎉 Store scrape completed: ${storeId}`);
}
catch (error) {
logger_1.logger.error('scraper', `Store scrape failed: ${error}`);
throw error;
}
}
/**
* Scrape multiple categories in parallel with concurrency limit
*/
async scrapeMultipleCategoriesParallel(storeId, categories, concurrency) {
const results = [];
// Process categories in batches
for (let i = 0; i < categories.length; i += concurrency) {
const batch = categories.slice(i, i + concurrency);
logger_1.logger.info('scraper', `Scraping batch ${Math.floor(i / concurrency) + 1}: ${batch.map(c => c.name).join(', ')}`);
const batchPromises = batch.map(category => {
// Create a new spider instance for each category
const engine = new ScraperEngine(1); // 1 concurrent request per spider
const spider = new DutchieSpider(engine);
return spider.scrapeCategory(storeId, category.id)
.catch(error => {
logger_1.logger.error('scraper', `Category ${category.name} failed: ${error}`);
throw error;
});
});
const batchResults = await Promise.allSettled(batchPromises);
results.push(...batchResults);
// Delay between batches to avoid overwhelming the server
if (i + concurrency < categories.length) {
logger_1.logger.info('scraper', 'Waiting 5s before next batch...');
await new Promise(resolve => setTimeout(resolve, 5000));
}
}
return results;
}
}
exports.DutchieSpider = DutchieSpider;

108
backend/dist/scraper-v2/index.js vendored Normal file
View File

@@ -0,0 +1,108 @@
"use strict";
/**
* Scraper V2 - Scrapy-inspired web scraping framework
*
* Architecture:
* - Engine: Main orchestrator
* - Scheduler: Priority queue with deduplication
* - Downloader: HTTP + Browser hybrid fetcher
* - Middlewares: Request/response processing chain
* - Pipelines: Item processing and persistence
* - Navigation: Category discovery
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __exportStar = (this && this.__exportStar) || function(m, exports) {
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = exports.PipelineEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = exports.MiddlewareEngine = exports.NavigationDiscovery = exports.Downloader = exports.RequestScheduler = exports.DutchieSpider = exports.ScraperEngine = void 0;
exports.scrapeCategory = scrapeCategory;
exports.scrapeStore = scrapeStore;
exports.discoverCategories = discoverCategories;
var engine_1 = require("./engine");
Object.defineProperty(exports, "ScraperEngine", { enumerable: true, get: function () { return engine_1.ScraperEngine; } });
Object.defineProperty(exports, "DutchieSpider", { enumerable: true, get: function () { return engine_1.DutchieSpider; } });
var scheduler_1 = require("./scheduler");
Object.defineProperty(exports, "RequestScheduler", { enumerable: true, get: function () { return scheduler_1.RequestScheduler; } });
var downloader_1 = require("./downloader");
Object.defineProperty(exports, "Downloader", { enumerable: true, get: function () { return downloader_1.Downloader; } });
var navigation_1 = require("./navigation");
Object.defineProperty(exports, "NavigationDiscovery", { enumerable: true, get: function () { return navigation_1.NavigationDiscovery; } });
var middlewares_1 = require("./middlewares");
Object.defineProperty(exports, "MiddlewareEngine", { enumerable: true, get: function () { return middlewares_1.MiddlewareEngine; } });
Object.defineProperty(exports, "UserAgentMiddleware", { enumerable: true, get: function () { return middlewares_1.UserAgentMiddleware; } });
Object.defineProperty(exports, "ProxyMiddleware", { enumerable: true, get: function () { return middlewares_1.ProxyMiddleware; } });
Object.defineProperty(exports, "RateLimitMiddleware", { enumerable: true, get: function () { return middlewares_1.RateLimitMiddleware; } });
Object.defineProperty(exports, "RetryMiddleware", { enumerable: true, get: function () { return middlewares_1.RetryMiddleware; } });
Object.defineProperty(exports, "BotDetectionMiddleware", { enumerable: true, get: function () { return middlewares_1.BotDetectionMiddleware; } });
Object.defineProperty(exports, "StealthMiddleware", { enumerable: true, get: function () { return middlewares_1.StealthMiddleware; } });
var pipelines_1 = require("./pipelines");
Object.defineProperty(exports, "PipelineEngine", { enumerable: true, get: function () { return pipelines_1.PipelineEngine; } });
Object.defineProperty(exports, "ValidationPipeline", { enumerable: true, get: function () { return pipelines_1.ValidationPipeline; } });
Object.defineProperty(exports, "SanitizationPipeline", { enumerable: true, get: function () { return pipelines_1.SanitizationPipeline; } });
Object.defineProperty(exports, "DeduplicationPipeline", { enumerable: true, get: function () { return pipelines_1.DeduplicationPipeline; } });
Object.defineProperty(exports, "ImagePipeline", { enumerable: true, get: function () { return pipelines_1.ImagePipeline; } });
Object.defineProperty(exports, "DatabasePipeline", { enumerable: true, get: function () { return pipelines_1.DatabasePipeline; } });
Object.defineProperty(exports, "StatsPipeline", { enumerable: true, get: function () { return pipelines_1.StatsPipeline; } });
__exportStar(require("./types"), exports);
// Main API functions
const engine_2 = require("./engine");
const navigation_2 = require("./navigation");
const downloader_2 = require("./downloader");
const logger_1 = require("../services/logger");
/**
* Scrape a single category
*/
async function scrapeCategory(storeId, categoryId) {
const engine = new engine_2.ScraperEngine(1);
const spider = new engine_2.DutchieSpider(engine);
try {
await spider.scrapeCategory(storeId, categoryId);
}
catch (error) {
logger_1.logger.error('scraper', `scrapeCategory failed: ${error}`);
throw error;
}
}
/**
* Scrape an entire store
*/
async function scrapeStore(storeId, parallel = 3) {
const engine = new engine_2.ScraperEngine(1);
const spider = new engine_2.DutchieSpider(engine);
try {
await spider.scrapeStore(storeId, parallel);
}
catch (error) {
logger_1.logger.error('scraper', `scrapeStore failed: ${error}`);
throw error;
}
}
/**
* Discover categories for a store
*/
async function discoverCategories(storeId) {
const downloader = new downloader_2.Downloader();
const discovery = new navigation_2.NavigationDiscovery(downloader);
try {
        // Discover categories (uses the predefined Dutchie category structure when a Dutchie menu is detected)
await discovery.discoverCategories(storeId);
}
catch (error) {
logger_1.logger.error('scraper', `discoverCategories failed: ${error}`);
throw error;
}
finally {
await downloader.cleanup();
}
}
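// Usage sketch tying the API together (illustration only, mirroring how the
// admin routes in this commit call these functions):
async function demoFullRun(storeId) {
    await discoverCategories(storeId); // build/refresh the category tree first
    await scrapeStore(storeId, 3); // then scrape with up to 3 categories in parallel
}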

263
backend/dist/scraper-v2/middlewares.js vendored Normal file
View File

@@ -0,0 +1,263 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
const types_1 = require("./types");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
];
function getRandomUserAgent() {
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
/**
* User Agent Rotation Middleware
*/
class UserAgentMiddleware {
name = 'UserAgentMiddleware';
priority = 100;
async processRequest(request) {
if (!request.metadata.userAgent) {
request.metadata.userAgent = getRandomUserAgent();
}
return request;
}
}
exports.UserAgentMiddleware = UserAgentMiddleware;
/**
* Proxy Rotation Middleware
*/
class ProxyMiddleware {
name = 'ProxyMiddleware';
priority = 90;
async getActiveProxy() {
try {
const result = await migrate_1.pool.query(`
SELECT host, port, protocol, username, password
FROM proxies
WHERE active = true AND is_anonymous = true
ORDER BY RANDOM()
LIMIT 1
`);
if (result.rows.length === 0) {
return null;
}
return result.rows[0];
}
catch (error) {
logger_1.logger.error('scraper', `Failed to get proxy: ${error}`);
return null;
}
}
async processRequest(request) {
// Only add proxy if not already set
if (!request.metadata.proxy && request.retryCount > 0) {
// Use proxy on retries
request.metadata.proxy = await this.getActiveProxy();
if (request.metadata.proxy) {
logger_1.logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`);
}
}
return request;
}
}
exports.ProxyMiddleware = ProxyMiddleware;
/**
* Rate Limiting Middleware with Adaptive Delays
*/
class RateLimitMiddleware {
name = 'RateLimitMiddleware';
priority = 80;
requestTimes = [];
errorCount = 0;
baseDelay = 2000; // 2 seconds base delay
maxDelay = 30000; // 30 seconds max
async processRequest(request) {
await this.waitForNextRequest();
return request;
}
async processResponse(response) {
// Record success - gradually reduce error count
this.errorCount = Math.max(0, this.errorCount - 1);
return response;
}
async processError(error) {
// Record error - increase delay
this.errorCount++;
return error;
}
async waitForNextRequest() {
// Calculate adaptive delay based on error count
const errorMultiplier = Math.pow(1.5, Math.min(this.errorCount, 5));
const adaptiveDelay = Math.min(this.baseDelay * errorMultiplier, this.maxDelay);
// Add random jitter (±20%)
const jitter = (Math.random() - 0.5) * 0.4 * adaptiveDelay;
const delay = adaptiveDelay + jitter;
const now = Date.now();
const lastRequest = this.requestTimes[this.requestTimes.length - 1] || 0;
const timeSinceLast = now - lastRequest;
if (timeSinceLast < delay) {
const waitTime = delay - timeSinceLast;
logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`);
await sleep(waitTime);
}
this.requestTimes.push(Date.now());
this.cleanup();
}
cleanup() {
// Keep only last minute of requests
const cutoff = Date.now() - 60000;
this.requestTimes = this.requestTimes.filter(t => t > cutoff);
}
setBaseDelay(ms) {
this.baseDelay = ms;
}
}
exports.RateLimitMiddleware = RateLimitMiddleware;
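// Worked example of the adaptive delay: with baseDelay = 2000ms and
// errorCount = 3 the multiplier is 1.5^3 = 3.375, so the target delay is
// min(2000 * 3.375, 30000) = 6750ms; the +/-20% jitter then lands the real
// wait somewhere around 5400-8100ms. The multiplier caps at 1.5^5 (~7.6x,
// ~15.2s before jitter) no matter how many errors accumulate.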
/**
* Retry Middleware with Exponential Backoff
*/
class RetryMiddleware {
name = 'RetryMiddleware';
priority = 70;
isRetryable(error) {
const retryableErrors = [
types_1.ErrorType.NETWORK_ERROR,
types_1.ErrorType.TIMEOUT,
types_1.ErrorType.SERVER_ERROR
];
if ('type' in error) {
return retryableErrors.includes(error.type);
}
// Check error message for common retryable patterns
const message = error.message.toLowerCase();
return (message.includes('timeout') ||
message.includes('network') ||
message.includes('econnreset') ||
message.includes('econnrefused') ||
message.includes('500') ||
message.includes('502') ||
message.includes('503'));
}
async processError(error, request) {
if (!this.isRetryable(error)) {
logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${error.message}`);
return error;
}
if (request.retryCount < request.maxRetries) {
// Calculate backoff delay
const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
await sleep(backoffDelay);
// Return null to indicate retry should happen
return null;
}
logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`);
return error;
}
}
exports.RetryMiddleware = RetryMiddleware;
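// Backoff schedule implied by processError: retry 1 waits 1000 * 2^0 = 1s,
// retry 2 waits 2s, retry 3 waits 4s; the 30s cap only matters from the
// sixth retry on, which the default maxRetries of 3 never reaches.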
/**
* Bot Detection Middleware
*/
class BotDetectionMiddleware {
name = 'BotDetectionMiddleware';
priority = 60;
detectedCount = 0;
DETECTION_THRESHOLD = 3;
async processResponse(response) {
const content = typeof response.content === 'string'
? response.content
: JSON.stringify(response.content);
// Check for bot detection indicators
const botIndicators = [
/captcha/i,
/cloudflare/i,
/access denied/i,
/you have been blocked/i,
/unusual traffic/i,
/robot/i
];
const detected = botIndicators.some(pattern => pattern.test(content));
if (detected) {
this.detectedCount++;
logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
if (this.detectedCount >= this.DETECTION_THRESHOLD) {
const error = new Error('Bot detection threshold reached');
error.type = types_1.ErrorType.BOT_DETECTION;
error.retryable = true;
error.request = response.request;
throw error;
}
}
else {
// Gradually decrease detection count on successful requests
this.detectedCount = Math.max(0, this.detectedCount - 0.5);
}
return response;
}
}
exports.BotDetectionMiddleware = BotDetectionMiddleware;
/**
* Stealth Mode Middleware
*/
class StealthMiddleware {
name = 'StealthMiddleware';
priority = 95;
async processRequest(request) {
// Flag that this request needs stealth mode
request.metadata.requiresStealth = true;
return request;
}
}
exports.StealthMiddleware = StealthMiddleware;
/**
* Middleware Engine to orchestrate all middlewares
*/
class MiddlewareEngine {
middlewares = [];
use(middleware) {
this.middlewares.push(middleware);
// Sort by priority (higher first)
this.middlewares.sort((a, b) => b.priority - a.priority);
}
async processRequest(request) {
let current = request;
for (const middleware of this.middlewares) {
if (middleware.processRequest) {
current = await middleware.processRequest(current);
}
}
return current;
}
async processResponse(response) {
let current = response;
for (const middleware of this.middlewares) {
if (middleware.processResponse) {
current = await middleware.processResponse(current);
}
}
return current;
}
async processError(error, request) {
let currentError = error;
for (const middleware of this.middlewares) {
if (middleware.processError && currentError) {
currentError = await middleware.processError(currentError, request);
if (currentError === null) {
// Middleware handled the error (e.g., retry)
break;
}
}
}
return currentError;
}
}
exports.MiddlewareEngine = MiddlewareEngine;

278
backend/dist/scraper-v2/navigation.js vendored Normal file
View File

@@ -0,0 +1,278 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.NavigationDiscovery = void 0;
const migrate_1 = require("../db/migrate");
const logger_1 = require("../services/logger");
/**
* Navigation Discovery - finds and builds category structure
*/
class NavigationDiscovery {
downloader;
constructor(downloader) {
this.downloader = downloader;
}
/**
* Discover categories from a store's main page
*/
async discoverCategories(storeId) {
logger_1.logger.info('categories', `Starting category discovery for store ${storeId}`);
try {
// Get store info
const storeResult = await migrate_1.pool.query(`
SELECT id, name, slug, dutchie_url
FROM stores
WHERE id = $1
`, [storeId]);
if (storeResult.rows.length === 0) {
throw new Error('Store not found');
}
const store = storeResult.rows[0];
const baseUrl = store.dutchie_url;
// Create request to fetch the main page
const request = {
url: baseUrl,
priority: 100,
retryCount: 0,
maxRetries: 3,
metadata: {
requiresBrowser: true,
requiresStealth: true
},
callback: async () => ({ items: [], requests: [] })
};
// Fetch the page
const response = await this.downloader.fetch(request);
// Extract navigation links
const page = await this.downloader.getCurrentPage();
if (!page) {
throw new Error('No active page for navigation extraction');
}
const links = await this.extractNavigationLinks(page, baseUrl);
logger_1.logger.info('categories', `Found ${links.length} navigation links`);
// Check if it's a Dutchie menu
const isDutchie = await this.isDutchieMenu(page);
if (isDutchie) {
logger_1.logger.info('categories', 'Detected Dutchie menu - using predefined structure');
await this.createDutchieCategories(storeId, store, links);
}
else {
logger_1.logger.info('categories', 'Custom menu detected - extracting from navigation');
await this.createCustomCategories(storeId, store, links);
}
logger_1.logger.info('categories', `✅ Category discovery completed for ${store.name}`);
}
catch (error) {
logger_1.logger.error('categories', `Category discovery failed: ${error}`);
throw error;
}
}
/**
* Extract navigation links from page
*/
async extractNavigationLinks(page, baseUrl) {
return await page.evaluate((base) => {
const links = [];
// Look for navigation elements
const navSelectors = [
'nav a',
'[role="navigation"] a',
'[class*="nav"] a',
'[class*="menu"] a',
'[class*="category"] a',
'header a'
];
const foundLinks = new Set();
for (const selector of navSelectors) {
// @ts-ignore - runs in browser context
const elements = document.querySelectorAll(selector);
elements.forEach((el) => {
const text = el.textContent?.trim();
let href = el.href || el.getAttribute('href');
if (!text || !href || text.length < 2)
return;
// Normalize href
if (href.startsWith('/')) {
// @ts-ignore - runs in browser context
const url = new URL(base);
href = `${url.origin}${href}`;
}
// Skip external links and anchors
if (!href.includes(base) || href.includes('#'))
return;
// Skip duplicates
const linkKey = `${text}:${href}`;
if (foundLinks.has(linkKey))
return;
foundLinks.add(linkKey);
// Determine if it's likely a category
const categoryKeywords = [
'flower', 'pre-roll', 'vape', 'edible', 'concentrate',
'topical', 'accessory', 'brand', 'special', 'shop',
'indica', 'sativa', 'hybrid', 'cbd', 'thc'
];
const isCategory = categoryKeywords.some(kw => text.toLowerCase().includes(kw) ||
href.toLowerCase().includes(kw));
links.push({
text,
href,
isCategory
});
});
}
return links;
}, baseUrl);
}
/**
* Check if it's a Dutchie menu
*/
async isDutchieMenu(page) {
return await page.evaluate(() => {
// Check for Dutchie markers
// @ts-ignore - runs in browser context
if (window.reactEnv) {
// @ts-ignore - runs in browser context
const env = window.reactEnv;
if (env.adminUrl?.includes('dutchie.com') ||
env.apiUrl?.includes('dutchie.com') ||
env.consumerUrl?.includes('dutchie.com')) {
return true;
}
}
// @ts-ignore - runs in browser context
const htmlContent = document.documentElement.innerHTML;
return (htmlContent.includes('admin.dutchie.com') ||
htmlContent.includes('api.dutchie.com') ||
htmlContent.includes('embedded-menu') ||
htmlContent.includes('window.reactEnv'));
});
}
/**
* Create categories for Dutchie menus (predefined structure)
     * Uses the predefined Dutchie category structure listed below
*/
async createDutchieCategories(storeId, store, discoveredLinks) {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
logger_1.logger.info('categories', `Creating predefined Dutchie category structure`);
const baseUrl = store.dutchie_url;
            // Predefined Dutchie category structure
const DUTCHIE_CATEGORIES = [
{ name: 'Shop', slug: 'shop', parentSlug: undefined },
{ name: 'Flower', slug: 'flower', parentSlug: 'shop' },
{ name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
{ name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
{ name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
{ name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
{ name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
{ name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
{ name: 'Brands', slug: 'brands', parentSlug: undefined },
{ name: 'Specials', slug: 'specials', parentSlug: undefined }
];
for (const category of DUTCHIE_CATEGORIES) {
let categoryUrl;
if (category.parentSlug) {
// Subcategory: /embedded-menu/{slug}/shop/flower
categoryUrl = `${baseUrl}/${category.parentSlug}/${category.slug}`;
}
else {
// Top-level: /embedded-menu/{slug}/shop
categoryUrl = `${baseUrl}/${category.slug}`;
}
const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug;
if (!category.parentSlug) {
// Create parent category
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
VALUES ($1, $2, $3, $4, $5, true, NULL)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5
RETURNING id
`, [storeId, category.name, category.slug, categoryUrl, path]);
logger_1.logger.info('categories', `📁 ${category.name}`);
}
else {
// Create subcategory
const parentResult = await client.query(`
SELECT id FROM categories
WHERE store_id = $1 AND slug = $2
`, [storeId, category.parentSlug]);
if (parentResult.rows.length > 0) {
const parentId = parentResult.rows[0].id;
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
VALUES ($1, $2, $3, $4, $5, true, $6)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
`, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
logger_1.logger.info('categories', ` └── ${category.name}`);
}
}
}
await client.query('COMMIT');
logger_1.logger.info('categories', `✅ Created ${DUTCHIE_CATEGORIES.length} Dutchie categories successfully`);
}
catch (error) {
await client.query('ROLLBACK');
logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`);
throw error;
}
finally {
client.release();
}
}
/**
* Create categories from discovered links (custom menus)
*/
async createCustomCategories(storeId, store, links) {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
// Filter to likely category links
const categoryLinks = links.filter(link => link.isCategory);
let displayOrder = 0;
for (const link of categoryLinks) {
// Generate slug from text
const slug = link.text
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-|-$/g, '');
// Determine path from URL
const url = new URL(link.href);
const path = url.pathname.replace(/^\//, '');
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, display_order)
VALUES ($1, $2, $3, $4, $5, true, $6)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5, display_order = $6
`, [storeId, link.text, slug, link.href, path, displayOrder++]);
logger_1.logger.info('categories', `📁 ${link.text} -> ${link.href}`);
}
await client.query('COMMIT');
logger_1.logger.info('categories', `✅ Created ${categoryLinks.length} custom categories`);
}
catch (error) {
await client.query('ROLLBACK');
throw error;
}
finally {
client.release();
}
}
/**
     * Ensure the display_order column exists on the categories table
*/
async ensureDisplayOrderColumn() {
try {
await migrate_1.pool.query(`
ALTER TABLE categories
ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0
`);
logger_1.logger.info('categories', 'Ensured display_order column exists');
}
catch (error) {
logger_1.logger.warn('categories', `Could not add display_order column: ${error}`);
}
}
}
exports.NavigationDiscovery = NavigationDiscovery;
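// Example of the URLs createDutchieCategories generates, assuming a store
// whose dutchie_url is 'https://dutchie.com/embedded-menu/example-store'
// (hypothetical slug):
//   Shop   -> https://dutchie.com/embedded-menu/example-store/shop
//   Flower -> https://dutchie.com/embedded-menu/example-store/shop/flower
//   Brands -> https://dutchie.com/embedded-menu/example-store/brands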

300
backend/dist/scraper-v2/pipelines.js vendored Normal file
View File

@@ -0,0 +1,300 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.PipelineEngine = exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = void 0;
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
/**
* Validation Pipeline - ensures data quality
*/
class ValidationPipeline {
name = 'ValidationPipeline';
priority = 100;
async process(item, spider) {
// Required fields
if (!item.name || item.name.trim().length < 2) {
logger_1.logger.warn('pipeline', `Dropping product: invalid name`);
return null;
}
if (!item.dutchieUrl) {
logger_1.logger.warn('pipeline', `Dropping product ${item.name}: no URL`);
return null;
}
// Validate numeric fields
if (item.price !== undefined && (item.price < 0 || item.price > 10000)) {
logger_1.logger.warn('pipeline', `Invalid price for ${item.name}: ${item.price}`);
item.price = undefined;
}
if (item.thcPercentage !== undefined && (item.thcPercentage < 0 || item.thcPercentage > 100)) {
logger_1.logger.warn('pipeline', `Invalid THC for ${item.name}: ${item.thcPercentage}`);
item.thcPercentage = undefined;
}
if (item.cbdPercentage !== undefined && (item.cbdPercentage < 0 || item.cbdPercentage > 100)) {
logger_1.logger.warn('pipeline', `Invalid CBD for ${item.name}: ${item.cbdPercentage}`);
item.cbdPercentage = undefined;
}
return item;
}
}
exports.ValidationPipeline = ValidationPipeline;
/**
* Sanitization Pipeline - cleans and normalizes data
*/
class SanitizationPipeline {
name = 'SanitizationPipeline';
priority = 90;
async process(item, spider) {
// Truncate long strings
if (item.name) {
item.name = item.name.substring(0, 500).trim();
}
if (item.description) {
item.description = item.description.substring(0, 5000).trim();
}
if (item.brand) {
item.brand = item.brand.substring(0, 255).trim();
}
if (item.weight) {
item.weight = item.weight.substring(0, 100).trim();
}
// Normalize strain type
if (item.strainType) {
const normalized = item.strainType.toLowerCase();
if (normalized.includes('indica')) {
item.strainType = 'Indica';
}
else if (normalized.includes('sativa')) {
item.strainType = 'Sativa';
}
else if (normalized.includes('hybrid')) {
item.strainType = 'Hybrid';
}
else {
item.strainType = undefined;
}
}
// Clean up metadata
if (item.metadata) {
// Remove empty arrays
Object.keys(item.metadata).forEach(key => {
if (Array.isArray(item.metadata[key]) && item.metadata[key].length === 0) {
delete item.metadata[key];
}
});
}
return item;
}
}
exports.SanitizationPipeline = SanitizationPipeline;
/**
* Deduplication Pipeline - prevents duplicate items
*/
class DeduplicationPipeline {
name = 'DeduplicationPipeline';
priority = 80;
seen = new Set();
async process(item, spider) {
const fingerprint = `${item.dutchieProductId}`;
if (this.seen.has(fingerprint)) {
logger_1.logger.debug('pipeline', `Duplicate product detected: ${item.name}`);
return null;
}
this.seen.add(fingerprint);
return item;
}
clear() {
this.seen.clear();
}
}
exports.DeduplicationPipeline = DeduplicationPipeline;
/**
* Image Processing Pipeline - handles image downloads
*/
class ImagePipeline {
name = 'ImagePipeline';
priority = 70;
extractImageId(url) {
try {
const match = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i);
return match ? match[1] : null;
}
catch (e) {
return null;
}
}
getFullSizeImageUrl(imageUrl) {
const imageId = this.extractImageId(imageUrl);
if (!imageId)
return imageUrl;
return `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`;
}
async process(item, spider) {
if (item.imageUrl) {
// Convert to full-size URL
item.imageUrl = this.getFullSizeImageUrl(item.imageUrl);
}
return item;
}
}
exports.ImagePipeline = ImagePipeline;
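// Worked example of the full-size rewrite (the image id is hypothetical):
//   in:  https://images.dutchie.com/abc123def?auto=format&w=400
//   out: https://images.dutchie.com/abc123def?auto=format&fit=max&q=95&w=2000&h=2000
// URLs that don't match the images.dutchie.com/<hex-id> pattern are passed
// through unchanged.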
/**
* Database Pipeline - saves items to database
*/
class DatabasePipeline {
name = 'DatabasePipeline';
priority = 10; // Low priority - runs last
async process(item, spider) {
const client = await migrate_1.pool.connect();
try {
// Extract store and category from metadata (set by spider)
const storeId = item.storeId;
const categoryId = item.categoryId;
if (!storeId || !categoryId) {
logger_1.logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`);
return null;
}
// Check if product exists
const existingResult = await client.query(`
SELECT id, image_url, local_image_path
FROM products
WHERE store_id = $1 AND name = $2 AND category_id = $3
`, [storeId, item.name, categoryId]);
let localImagePath = null;
let productId;
if (existingResult.rows.length > 0) {
// Update existing product
productId = existingResult.rows[0].id;
localImagePath = existingResult.rows[0].local_image_path;
await client.query(`
UPDATE products
SET name = $1, description = $2, price = $3,
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE id = $12
`, [
item.name, item.description, item.price,
item.strainType, item.thcPercentage, item.cbdPercentage,
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
JSON.stringify(item.metadata || {}), productId
]);
logger_1.logger.debug('pipeline', `Updated product: ${item.name}`);
}
else {
// Insert new product
const insertResult = await client.query(`
INSERT INTO products (
store_id, category_id, dutchie_product_id, name, description,
price, strain_type, thc_percentage, cbd_percentage,
brand, weight, image_url, dutchie_url, in_stock, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
RETURNING id
`, [
storeId, categoryId, item.dutchieProductId, item.name, item.description,
item.price, item.strainType, item.thcPercentage, item.cbdPercentage,
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
JSON.stringify(item.metadata || {})
]);
productId = insertResult.rows[0].id;
logger_1.logger.debug('pipeline', `Inserted new product: ${item.name}`);
}
// Download image if needed
if (item.imageUrl && !localImagePath) {
try {
localImagePath = await (0, minio_1.uploadImageFromUrl)(item.imageUrl, productId);
await client.query(`
UPDATE products
SET local_image_path = $1
WHERE id = $2
`, [localImagePath, productId]);
logger_1.logger.debug('pipeline', `Downloaded image for: ${item.name}`);
}
catch (error) {
logger_1.logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`);
}
}
return item;
}
catch (error) {
logger_1.logger.error('pipeline', `Failed to save product ${item.name}: ${error}`);
return null;
}
finally {
client.release();
}
}
}
exports.DatabasePipeline = DatabasePipeline;
/**
* Stats Pipeline - tracks statistics
*/
class StatsPipeline {
name = 'StatsPipeline';
priority = 50;
stats = {
total: 0,
withImages: 0,
withThc: 0,
withCbd: 0,
withDescription: 0
};
async process(item, spider) {
this.stats.total++;
if (item.imageUrl)
this.stats.withImages++;
if (item.thcPercentage)
this.stats.withThc++;
if (item.cbdPercentage)
this.stats.withCbd++;
if (item.description)
this.stats.withDescription++;
return item;
}
getStats() {
return { ...this.stats };
}
clear() {
this.stats = {
total: 0,
withImages: 0,
withThc: 0,
withCbd: 0,
withDescription: 0
};
}
}
exports.StatsPipeline = StatsPipeline;
/**
* Pipeline Engine - orchestrates all pipelines
*/
class PipelineEngine {
pipelines = [];
use(pipeline) {
this.pipelines.push(pipeline);
// Sort by priority (higher first)
this.pipelines.sort((a, b) => b.priority - a.priority);
}
async processItem(item, spider) {
let current = item;
for (const pipeline of this.pipelines) {
try {
current = await pipeline.process(current, spider);
if (!current) {
// Item was filtered out
logger_1.logger.debug('pipeline', `Item filtered by ${pipeline.name}`);
return null;
}
}
catch (error) {
logger_1.logger.error('pipeline', `Error in ${pipeline.name}: ${error}`);
// Continue with other pipelines
}
}
return current;
}
getPipeline(name) {
return this.pipelines.find(p => p.name === name);
}
}
exports.PipelineEngine = PipelineEngine;
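/*
 * Usage sketch (illustrative only, not part of the compiled output):
 * how a spider might wire these pipelines into an engine. With suitable
 * priorities, dedup and image rewriting run before the database write.
 * The `spider` shape here is an assumption.
 */
async function examplePipelineRun(item) {
    const engine = new PipelineEngine();
    engine.use(new DeduplicationPipeline());
    engine.use(new ImagePipeline()); // priority 70
    engine.use(new StatsPipeline()); // priority 50
    engine.use(new DatabasePipeline()); // priority 10, runs last
    const spider = { name: 'example-spider' }; // assumed minimal shape
    const saved = await engine.processItem(item, spider);
    const stats = engine.getPipeline('StatsPipeline');
    return { saved, stats: stats ? stats.getStats() : null };
}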

136
backend/dist/scraper-v2/scheduler.js vendored Normal file
View File

@@ -0,0 +1,136 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RequestScheduler = void 0;
const logger_1 = require("../services/logger");
const crypto_1 = __importDefault(require("crypto"));
class RequestScheduler {
queue = [];
inProgress = new Set();
seen = new Set();
deduplicationEnabled = true;
constructor(deduplicationEnabled = true) {
this.deduplicationEnabled = deduplicationEnabled;
}
/**
* Generate fingerprint for request deduplication
*/
generateFingerprint(request) {
if (request.fingerprint) {
return request.fingerprint;
}
// Generate fingerprint based on URL and relevant metadata
const data = {
url: request.url,
method: request.metadata?.method || 'GET',
body: request.metadata?.body
};
return crypto_1.default.createHash('md5').update(JSON.stringify(data)).digest('hex');
}
/**
* Add a request to the queue
*/
enqueue(partialRequest) {
if (!partialRequest.url) {
logger_1.logger.warn('scraper', 'Cannot enqueue request without URL');
return false;
}
const fingerprint = this.generateFingerprint(partialRequest);
// Check for duplicates
if (this.deduplicationEnabled && this.seen.has(fingerprint)) {
logger_1.logger.debug('scraper', `Request already seen: ${partialRequest.url}`);
return false;
}
// Create full request with defaults
const request = {
url: partialRequest.url,
priority: partialRequest.priority ?? 0,
retryCount: partialRequest.retryCount ?? 0,
maxRetries: partialRequest.maxRetries ?? 3,
metadata: partialRequest.metadata || {},
callback: partialRequest.callback,
errorHandler: partialRequest.errorHandler,
fingerprint
};
this.queue.push(request);
this.seen.add(fingerprint);
// Sort by priority (higher priority first)
this.queue.sort((a, b) => b.priority - a.priority);
logger_1.logger.debug('scraper', `Enqueued: ${request.url} (priority: ${request.priority})`);
return true;
}
/**
* Get the next request from the queue
*/
dequeue() {
const request = this.queue.shift();
if (request) {
this.inProgress.add(request.fingerprint);
}
return request || null;
}
/**
* Mark a request as complete
*/
markComplete(request) {
if (request.fingerprint) {
this.inProgress.delete(request.fingerprint);
}
}
/**
* Requeue a failed request (for retry)
*/
requeueForRetry(request) {
if (request.fingerprint) {
this.inProgress.delete(request.fingerprint);
this.seen.delete(request.fingerprint);
}
request.retryCount++;
if (request.retryCount > request.maxRetries) {
logger_1.logger.warn('scraper', `Max retries exceeded for: ${request.url}`);
return false;
}
// Decrease priority for retried requests
request.priority = Math.max(0, request.priority - 1);
return this.enqueue(request);
}
/**
* Get queue stats
*/
getStats() {
return {
pending: this.queue.length,
inProgress: this.inProgress.size,
total: this.seen.size
};
}
/**
* Check if queue is empty
*/
isEmpty() {
return this.queue.length === 0 && this.inProgress.size === 0;
}
/**
* Clear all queues
*/
clear() {
this.queue = [];
this.inProgress.clear();
this.seen.clear();
}
/**
* Get pending requests count
*/
getPendingCount() {
return this.queue.length;
}
/**
* Get in-progress count
*/
getInProgressCount() {
return this.inProgress.size;
}
}
exports.RequestScheduler = RequestScheduler;
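/*
 * Usage sketch (illustrative only): a sequential drain loop over the
 * scheduler. The handler (fetch/parse logic) is caller-supplied and an
 * assumption of this sketch.
 */
async function exampleDrain(handler) {
    const scheduler = new RequestScheduler();
    scheduler.enqueue({ url: 'https://example.com/menu', priority: 10 });
    let request;
    while ((request = scheduler.dequeue()) !== null) {
        try {
            await handler(request);
            scheduler.markComplete(request);
        }
        catch (err) {
            // Re-queues with lowered priority until maxRetries is hit
            scheduler.requeueForRetry(request);
        }
    }
    return scheduler.getStats();
}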

13
backend/dist/scraper-v2/types.js vendored Normal file
View File

@@ -0,0 +1,13 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ErrorType = void 0;
var ErrorType;
(function (ErrorType) {
ErrorType["NETWORK_ERROR"] = "NETWORK_ERROR";
ErrorType["TIMEOUT"] = "TIMEOUT";
ErrorType["PARSE_ERROR"] = "PARSE_ERROR";
ErrorType["BOT_DETECTION"] = "BOT_DETECTION";
ErrorType["NOT_FOUND"] = "NOT_FOUND";
ErrorType["SERVER_ERROR"] = "SERVER_ERROR";
ErrorType["UNKNOWN"] = "UNKNOWN";
})(ErrorType || (exports.ErrorType = ErrorType = {}));
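/*
 * Classification sketch (illustrative only): how a downloader might map
 * a failed request onto these error types. The `status`/`code` fields
 * are assumptions about an axios-style error shape.
 */
function exampleClassifyError(error) {
    const status = error?.response?.status;
    if (status === 404)
        return ErrorType.NOT_FOUND;
    if (status === 403 || status === 429)
        return ErrorType.BOT_DETECTION;
    if (status >= 500)
        return ErrorType.SERVER_ERROR;
    if (error?.code === 'ECONNABORTED' || /timeout/i.test(String(error)))
        return ErrorType.TIMEOUT;
    if (error?.code === 'ECONNREFUSED' || error?.code === 'ENOTFOUND')
        return ErrorType.NETWORK_ERROR;
    return ErrorType.UNKNOWN;
}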

View File

@@ -0,0 +1,168 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.discoverCategories = discoverCategories;
const puppeteer_1 = __importDefault(require("puppeteer"));
const migrate_1 = require("../db/migrate");
const logger_1 = require("./logger");
const DUTCHIE_CATEGORIES = [
{ name: 'Shop', slug: 'shop' },
{ name: 'Flower', slug: 'flower', parentSlug: 'shop' },
{ name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
{ name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
{ name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
{ name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
{ name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
{ name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
{ name: 'Brands', slug: 'brands' },
{ name: 'Specials', slug: 'specials' }
];
async function makePageStealthy(page) {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
window.chrome = { runtime: {} };
});
}
async function isDutchieMenu(page) {
try {
// Check page source for Dutchie markers
const isDutchie = await page.evaluate(() => {
// Check for window.reactEnv with dutchie URLs
if (window.reactEnv) {
const env = window.reactEnv;
if (env.adminUrl?.includes('dutchie.com') ||
env.apiUrl?.includes('dutchie.com') ||
env.consumerUrl?.includes('dutchie.com')) {
return true;
}
}
// Check HTML source for dutchie references
const htmlContent = document.documentElement.innerHTML;
if (htmlContent.includes('admin.dutchie.com') ||
htmlContent.includes('api.dutchie.com') ||
htmlContent.includes('embedded-menu') ||
htmlContent.includes('window.reactEnv')) {
return true;
}
return false;
});
return isDutchie;
}
catch (error) {
logger_1.logger.warn('categories', `Error detecting Dutchie menu: ${error}`);
return false;
}
}
async function discoverCategories(storeId) {
let browser = null;
try {
logger_1.logger.info('categories', `Discovering categories for store ID: ${storeId}`);
const storeResult = await migrate_1.pool.query(`
SELECT id, name, slug, dutchie_url
FROM stores
WHERE id = $1
`, [storeId]);
if (storeResult.rows.length === 0) {
throw new Error('Store not found');
}
const store = storeResult.rows[0];
const baseUrl = store.dutchie_url;
// Launch browser to check page source
browser = await puppeteer_1.default.launch({
headless: 'new',
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled'
]
});
const page = await browser.newPage();
await makePageStealthy(page);
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
logger_1.logger.info('categories', `Loading page to detect menu type: ${baseUrl}`);
await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
await page.waitForTimeout(3000);
// Detect if it's a Dutchie menu by inspecting page source
const isDutchie = await isDutchieMenu(page);
await browser.close();
browser = null;
if (isDutchie) {
logger_1.logger.info('categories', `✅ Detected Dutchie menu for ${store.name}`);
await createDutchieCategories(storeId, store);
}
else {
logger_1.logger.info('categories', `⚠️ Non-Dutchie menu detected; custom scraping logic would be required`);
throw new Error('Non-Dutchie menus not yet supported. Please contact support.');
}
}
catch (error) {
logger_1.logger.error('categories', `Category discovery error: ${error}`);
if (browser)
await browser.close();
throw error;
}
}
async function createDutchieCategories(storeId, store) {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
logger_1.logger.info('categories', `Creating predefined Dutchie category structure`);
const baseUrl = store.dutchie_url;
for (const category of DUTCHIE_CATEGORIES) {
let categoryUrl;
if (category.parentSlug) {
// Subcategory: /embedded-menu/{slug}/shop/flower
categoryUrl = `${baseUrl}/${category.parentSlug}/${category.slug}`;
}
else {
// Top-level: /embedded-menu/{slug}/shop
categoryUrl = `${baseUrl}/${category.slug}`;
}
const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug;
if (!category.parentSlug) {
// Create parent category
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
VALUES ($1, $2, $3, $4, $5, true, NULL)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5
RETURNING id
`, [storeId, category.name, category.slug, categoryUrl, path]);
logger_1.logger.info('categories', `📁 ${category.name}`);
}
else {
// Create subcategory
const parentResult = await client.query(`
SELECT id FROM categories
WHERE store_id = $1 AND slug = $2
`, [storeId, category.parentSlug]);
if (parentResult.rows.length > 0) {
const parentId = parentResult.rows[0].id;
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
VALUES ($1, $2, $3, $4, $5, true, $6)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
`, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
logger_1.logger.info('categories', ` └── ${category.name}`);
}
}
}
await client.query('COMMIT');
logger_1.logger.info('categories', `✅ Created ${DUTCHIE_CATEGORIES.length} Dutchie categories successfully`);
}
catch (error) {
await client.query('ROLLBACK');
logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`);
throw error;
}
finally {
client.release();
}
}
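/*
 * Usage sketch (illustrative only): run discovery across every active
 * store. The `active` column is taken from the stores schema used
 * elsewhere in this codebase.
 */
async function exampleDiscoverAllStores() {
    const result = await migrate_1.pool.query(`SELECT id, name FROM stores WHERE active = true`);
    for (const store of result.rows) {
        try {
            await discoverCategories(store.id);
        }
        catch (err) {
            logger_1.logger.warn('categories', `Skipping ${store.name}: ${err}`);
        }
    }
}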

56
backend/dist/services/logger.js vendored Normal file
View File

@@ -0,0 +1,56 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.logger = void 0;
class LogService {
logs = [];
maxLogs = 1000;
log(level, category, message) {
const entry = {
timestamp: new Date(),
level,
category,
message
};
this.logs.unshift(entry);
if (this.logs.length > this.maxLogs) {
this.logs = this.logs.slice(0, this.maxLogs);
}
const timestamp = entry.timestamp.toISOString();
const prefix = `[${timestamp}] [${category.toUpperCase()}] [${level.toUpperCase()}]`;
if (level === 'error') {
console.error(prefix, message);
}
else if (level === 'warn') {
console.warn(prefix, message);
}
else {
console.log(prefix, message);
}
}
info(category, message) {
this.log('info', category, message);
}
error(category, message) {
this.log('error', category, message);
}
warn(category, message) {
this.log('warn', category, message);
}
debug(category, message) {
this.log('debug', category, message);
}
getLogs(limit = 100, level, category) {
let filtered = this.logs;
if (level) {
filtered = filtered.filter(log => log.level === level);
}
if (category) {
filtered = filtered.filter(log => log.category === category);
}
return filtered.slice(0, limit);
}
clear() {
this.logs = [];
}
}
exports.logger = new LogService();
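/*
 * Usage sketch (illustrative only): the exported singleton is shared by
 * every module; categories are free-form strings.
 */
function exampleRecentScraperErrors() {
    exports.logger.info('scraper', 'Run started');
    exports.logger.error('scraper', 'Navigation timed out');
    // Last 10 error-level entries for the scraper category:
    return exports.logger.getLogs(10, 'error', 'scraper');
}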

166
backend/dist/services/proxy.js vendored Normal file
View File

@@ -0,0 +1,166 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.testProxy = testProxy;
exports.saveProxyTestResult = saveProxyTestResult;
exports.testAllProxies = testAllProxies;
exports.addProxy = addProxy;
exports.addProxiesFromList = addProxiesFromList;
const axios_1 = __importDefault(require("axios"));
const socks_proxy_agent_1 = require("socks-proxy-agent");
const https_proxy_agent_1 = require("https-proxy-agent");
const migrate_1 = require("../db/migrate");
async function getSettings() {
const result = await migrate_1.pool.query(`
SELECT key, value FROM settings
WHERE key IN ('proxy_timeout_ms', 'proxy_test_url')
`);
const settings = {};
result.rows.forEach(row => {
settings[row.key] = row.value;
});
return {
timeout: parseInt(settings.proxy_timeout_ms || '3000'),
testUrl: settings.proxy_test_url || 'https://httpbin.org/ip'
};
}
async function testProxy(host, port, protocol, username, password) {
try {
const { timeout, testUrl } = await getSettings();
const startTime = Date.now();
// Construct proxy URL
let proxyUrl;
if (username && password) {
proxyUrl = `${protocol}://${username}:${password}@${host}:${port}`;
}
else {
proxyUrl = `${protocol}://${host}:${port}`;
}
// Create appropriate agent based on protocol
let agent;
if (protocol === 'socks5' || protocol === 'socks') {
agent = new socks_proxy_agent_1.SocksProxyAgent(proxyUrl);
}
else if (protocol === 'http' || protocol === 'https') {
agent = new https_proxy_agent_1.HttpsProxyAgent(proxyUrl);
}
else {
return {
success: false,
error: `Unsupported protocol: ${protocol}`
};
}
// Make test request
const response = await axios_1.default.get(testUrl, {
httpAgent: agent,
httpsAgent: agent,
timeout,
});
const responseTimeMs = Date.now() - startTime;
        // Anonymity check: httpbin-style endpoints echo the requester's IP
        // in `origin`. A strict check would compare that against our real
        // egress IP; for simplicity, any successful proxied response is
        // treated as anonymous.
        let isAnonymous = false;
        if (response.data && response.data.origin) {
            isAnonymous = true;
        }
return {
success: true,
responseTimeMs,
isAnonymous
};
}
catch (error) {
return {
success: false,
error: error.message || 'Unknown error'
};
}
}
async function saveProxyTestResult(proxyId, result) {
await migrate_1.pool.query(`
UPDATE proxies
SET last_tested_at = CURRENT_TIMESTAMP,
test_result = $1,
response_time_ms = $2,
is_anonymous = $3,
active = $4,
updated_at = CURRENT_TIMESTAMP
WHERE id = $5
`, [
result.success ? 'success' : 'failed',
result.responseTimeMs || null,
result.isAnonymous || false,
result.success,
proxyId
]);
}
async function testAllProxies() {
console.log('🔍 Testing all proxies...');
const result = await migrate_1.pool.query(`
SELECT id, host, port, protocol, username, password
FROM proxies
`);
for (const proxy of result.rows) {
console.log(`Testing proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
const testResult = await testProxy(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
await saveProxyTestResult(proxy.id, testResult);
if (testResult.success) {
console.log(`✅ Proxy OK (${testResult.responseTimeMs}ms, anonymous: ${testResult.isAnonymous})`);
}
else {
console.log(`❌ Proxy failed: ${testResult.error}`);
}
// Small delay between tests
await new Promise(resolve => setTimeout(resolve, 500));
}
console.log('✅ Proxy testing complete');
}
async function addProxy(host, port, protocol, username, password) {
// Test the proxy first
const testResult = await testProxy(host, port, protocol, username, password);
if (!testResult.success) {
throw new Error(`Proxy test failed: ${testResult.error}`);
}
// Insert into database
const result = await migrate_1.pool.query(`
INSERT INTO proxies (host, port, protocol, username, password, active, is_anonymous, test_result, response_time_ms, last_tested_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, CURRENT_TIMESTAMP)
RETURNING id
`, [
host,
port,
protocol,
username,
password,
testResult.success,
testResult.isAnonymous,
'success',
testResult.responseTimeMs
]);
return result.rows[0].id;
}
async function addProxiesFromList(proxies) {
let added = 0;
let failed = 0;
const errors = [];
for (const proxy of proxies) {
try {
await addProxy(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
added++;
console.log(`✅ Added proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
catch (error) {
failed++;
const errorMsg = `${proxy.host}:${proxy.port} - ${error.message}`;
errors.push(errorMsg);
console.log(`❌ Failed to add proxy: ${errorMsg}`);
}
// Small delay between adds
await new Promise(resolve => setTimeout(resolve, 500));
}
return { added, failed, errors };
}
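/*
 * Usage sketch (illustrative only): importing a plain host:port list.
 * The line format and the socks5 default are assumptions.
 */
async function exampleImportProxyList(text) {
    const proxies = text
        .split('\n')
        .map(line => line.trim())
        .filter(Boolean)
        .map(line => {
            const [host, port] = line.split(':');
            return { host, port: parseInt(port, 10), protocol: 'socks5' };
        });
    return addProxiesFromList(proxies);
}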

104
backend/dist/services/scheduler.js vendored Normal file
View File

@@ -0,0 +1,104 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.startScheduler = startScheduler;
exports.stopScheduler = stopScheduler;
exports.restartScheduler = restartScheduler;
exports.triggerStoreScrape = triggerStoreScrape;
exports.triggerAllStoresScrape = triggerAllStoresScrape;
const node_cron_1 = __importDefault(require("node-cron"));
const migrate_1 = require("../db/migrate");
const scraper_v2_1 = require("../scraper-v2");
let scheduledJobs = [];
async function getSettings() {
const result = await migrate_1.pool.query(`
SELECT key, value FROM settings
WHERE key IN ('scrape_interval_hours', 'scrape_specials_time')
`);
const settings = {};
result.rows.forEach(row => {
settings[row.key] = row.value;
});
return {
scrapeIntervalHours: parseInt(settings.scrape_interval_hours || '4'),
scrapeSpecialsTime: settings.scrape_specials_time || '00:01'
};
}
async function scrapeAllStores() {
console.log('🔄 Starting scheduled scrape for all stores...');
const result = await migrate_1.pool.query(`
SELECT id, name FROM stores WHERE active = true AND scrape_enabled = true
`);
for (const store of result.rows) {
try {
console.log(`Scraping store: ${store.name}`);
await (0, scraper_v2_1.scrapeStore)(store.id);
}
catch (error) {
console.error(`Failed to scrape store ${store.name}:`, error);
}
}
console.log('✅ Scheduled scrape completed');
}
async function scrapeSpecials() {
console.log('🌟 Starting scheduled specials scrape...');
const result = await migrate_1.pool.query(`
SELECT s.id, s.name, c.id as category_id
FROM stores s
JOIN categories c ON c.store_id = s.id
WHERE s.active = true AND s.scrape_enabled = true
AND c.slug = 'specials' AND c.scrape_enabled = true
`);
for (const row of result.rows) {
try {
console.log(`Scraping specials for: ${row.name}`);
await (0, scraper_v2_1.scrapeCategory)(row.id, row.category_id);
}
catch (error) {
console.error(`Failed to scrape specials for ${row.name}:`, error);
}
}
console.log('✅ Specials scrape completed');
}
async function startScheduler() {
// Stop any existing jobs
stopScheduler();
const settings = await getSettings();
// Schedule regular store scrapes (every N hours)
const scrapeIntervalCron = `0 */${settings.scrapeIntervalHours} * * *`;
const storeJob = node_cron_1.default.schedule(scrapeIntervalCron, scrapeAllStores);
scheduledJobs.push(storeJob);
console.log(`📅 Scheduled store scraping: every ${settings.scrapeIntervalHours} hours`);
// Schedule specials scraping (daily at specified time)
const [hours, minutes] = settings.scrapeSpecialsTime.split(':');
const specialsCron = `${minutes} ${hours} * * *`;
const specialsJob = node_cron_1.default.schedule(specialsCron, scrapeSpecials);
scheduledJobs.push(specialsJob);
console.log(`📅 Scheduled specials scraping: daily at ${settings.scrapeSpecialsTime}`);
// Initial scrape on startup (after 10 seconds)
setTimeout(() => {
console.log('🚀 Running initial scrape...');
scrapeAllStores().catch(console.error);
}, 10000);
}
function stopScheduler() {
scheduledJobs.forEach(job => job.stop());
scheduledJobs = [];
console.log('🛑 Scheduler stopped');
}
async function restartScheduler() {
console.log('🔄 Restarting scheduler...');
stopScheduler();
await startScheduler();
}
// Manual trigger functions for admin
async function triggerStoreScrape(storeId) {
console.log(`🔧 Manual scrape triggered for store ID: ${storeId}`);
await (0, scraper_v2_1.scrapeStore)(storeId);
}
async function triggerAllStoresScrape() {
console.log('🔧 Manual scrape triggered for all stores');
await scrapeAllStores();
}
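/*
 * Worked example (illustrative only): how the settings above become
 * cron expressions.
 */
function exampleCronExpressions(intervalHours, specialsTime) {
    const [hours, minutes] = specialsTime.split(':');
    return {
        stores: `0 */${intervalHours} * * *`, // '4'     -> 00:00, 04:00, 08:00, ...
        specials: `${minutes} ${hours} * * *` // '00:01' -> daily at 00:01
    };
}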

80
backend/dist/services/scraper-debug.js vendored Normal file
View File

@@ -0,0 +1,80 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.debugDutchiePage = debugDutchiePage;
const puppeteer_1 = __importDefault(require("puppeteer"));
const logger_1 = require("./logger");
async function debugDutchiePage(url) {
const browser = await puppeteer_1.default.launch({
headless: 'new',
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
});
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
logger_1.logger.info('scraper', `Loading: ${url}`);
try {
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
logger_1.logger.info('scraper', 'Page loaded, waiting for content...');
// Wait for content to render
await page.waitForTimeout(8000);
const debug = await page.evaluate(() => {
// Try to find product cards
const productSelectors = [
'[data-testid*="product"]',
'[class*="Product"]',
'[class*="product"]',
'article',
'[role="article"]',
'li'
];
const results = {
selectors: {}
};
for (const selector of productSelectors) {
const elements = document.querySelectorAll(selector);
results.selectors[selector] = elements.length;
}
// Get sample HTML from first few matches
const firstMatch = document.querySelector('[class*="product" i], article, [data-testid*="product"]');
if (firstMatch) {
results.sampleHTML = firstMatch.outerHTML.substring(0, 1000);
results.sampleText = firstMatch.textContent?.substring(0, 500);
}
// Get all class names that might be products
const allElements = document.querySelectorAll('*');
const classNames = new Set();
allElements.forEach(el => {
const classes = el.className;
if (typeof classes === 'string' && classes.toLowerCase().includes('product')) {
classes.split(' ').forEach(c => classNames.add(c));
}
});
results.productClasses = Array.from(classNames).slice(0, 20);
results.bodyTextSample = document.body.innerText.substring(0, 500);
return results;
});
logger_1.logger.info('scraper', `Debug results:\n${JSON.stringify(debug, null, 2)}`);
}
catch (error) {
logger_1.logger.error('scraper', `Debug navigation error: ${error}`);
// Try to get whatever we can
try {
const partialDebug = await page.evaluate(() => {
return {
url: window.location.href,
title: document.title,
bodyLength: document.body?.innerHTML?.length || 0,
bodyStart: document.body?.innerHTML?.substring(0, 500) || ''
};
});
logger_1.logger.info('scraper', `Partial debug:\n${JSON.stringify(partialDebug, null, 2)}`);
}
catch (e) {
logger_1.logger.error('scraper', `Could not get partial debug: ${e}`);
}
}
await browser.close();
}
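/*
 * Usage sketch (illustrative only); the URL is an assumption:
 *
 * debugDutchiePage('https://dutchie.com/embedded-menu/example-store/shop/flower')
 *     .catch(err => logger_1.logger.error('scraper', `Debug run failed: ${err}`));
 */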

587
backend/dist/services/scraper.js vendored Normal file
View File

@@ -0,0 +1,587 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeCategory = scrapeCategory;
exports.saveProducts = saveProducts;
exports.scrapeStore = scrapeStore;
const puppeteer_1 = __importDefault(require("puppeteer"));
const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
const logger_1 = require("./logger");
const USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
];
function getRandomUserAgent() {
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}
function extractImageIdFromUrl(url) {
try {
const match = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i);
return match ? match[1] : null;
}
catch (e) {
return null;
}
}
function getFullSizeImageUrl(imageUrl) {
const imageId = extractImageIdFromUrl(imageUrl);
if (!imageId)
return imageUrl;
return `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`;
}
function sanitizeProductData(product) {
return {
...product,
name: product.name?.substring(0, 500) || 'Unnamed Product',
description: product.description || null,
brand: product.brand?.substring(0, 500) || null,
weight: product.weight?.substring(0, 100) || null,
thc: product.thc && product.thc < 100 ? product.thc : null,
cbd: product.cbd && product.cbd < 100 ? product.cbd : null
};
}
async function getActiveProxy() {
const result = await migrate_1.pool.query(`
SELECT host, port, protocol, username, password
FROM proxies
WHERE active = true AND is_anonymous = true
ORDER BY RANDOM()
LIMIT 1
`);
if (result.rows.length === 0) {
return null;
}
return result.rows[0];
}
async function makePageStealthy(page) {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
});
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
});
});
await page.evaluateOnNewDocument(() => {
window.chrome = {
runtime: {},
};
});
await page.evaluateOnNewDocument(() => {
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
? Promise.resolve({ state: 'denied' })
: originalQuery(parameters);
});
}
async function scrapeProductDetails(page, productUrl, productName) {
const maxRetries = 2;
let lastError = null;
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 20000 });
await page.waitForTimeout(3000);
const details = await page.evaluate(() => {
const allText = document.body.textContent || '';
let fullSizeImage = null;
const mainImageSelectors = [
'img[class*="ProductImage"]',
'img[class*="product-image"]',
'[class*="ImageGallery"] img',
'main img',
'img[src*="images.dutchie.com"]'
];
for (const sel of mainImageSelectors) {
const img = document.querySelector(sel);
if (img?.src && img.src.includes('dutchie.com')) {
fullSizeImage = img.src;
break;
}
}
let description = '';
const descSelectors = [
'[class*="description"]',
'[class*="Description"]',
'[data-testid*="description"]',
'p[class*="product"]'
];
for (const sel of descSelectors) {
const el = document.querySelector(sel);
if (el?.textContent?.trim() && el.textContent.length > 20) {
description = el.textContent.trim();
break;
}
}
let thc = null;
const thcPatterns = [
/THC[:\s]*(\d+\.?\d*)\s*%/i,
/Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
/(\d+\.?\d*)\s*%\s+THC/i
];
for (const pattern of thcPatterns) {
const match = allText.match(pattern);
if (match) {
thc = parseFloat(match[1]);
break;
}
}
let cbd = null;
const cbdPatterns = [
/CBD[:\s]*(\d+\.?\d*)\s*%/i,
/Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
/(\d+\.?\d*)\s*%\s+CBD/i
];
for (const pattern of cbdPatterns) {
const match = allText.match(pattern);
if (match) {
cbd = parseFloat(match[1]);
break;
}
}
let strainType = null;
if (allText.match(/\bindica\b/i))
strainType = 'Indica';
else if (allText.match(/\bsativa\b/i))
strainType = 'Sativa';
else if (allText.match(/\bhybrid\b/i))
strainType = 'Hybrid';
const terpenes = [];
const terpeneNames = [
'Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool',
'Humulene', 'Terpinolene', 'Ocimene', 'Bisabolol', 'Valencene'
];
terpeneNames.forEach(terp => {
if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) {
terpenes.push(terp);
}
});
const effects = [];
const effectNames = [
'Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative',
'Energetic', 'Focused', 'Calm', 'Sleepy', 'Hungry',
'Talkative', 'Giggly', 'Aroused'
];
effectNames.forEach(effect => {
if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) {
effects.push(effect);
}
});
let brand = null;
const brandSelectors = [
'[class*="brand"]',
'[class*="Brand"]',
'[data-testid*="brand"]'
];
for (const sel of brandSelectors) {
const el = document.querySelector(sel);
if (el?.textContent?.trim()) {
brand = el.textContent.trim();
break;
}
}
let lineage = null;
const lineageMatch = allText.match(/(?:Lineage|Genetics|Parents?)[:\s]*([^\n]+)/i);
if (lineageMatch) {
lineage = lineageMatch[1].trim();
}
const flavors = [];
const flavorNames = [
'Sweet', 'Citrus', 'Earthy', 'Pine', 'Berry', 'Diesel',
'Sour', 'Floral', 'Spicy', 'Woody', 'Tropical', 'Fruity',
'Vanilla', 'Mint', 'Cheese', 'Grape', 'Lemon', 'Orange'
];
flavorNames.forEach(flavor => {
if (allText.match(new RegExp(`\\b${flavor}\\b`, 'i'))) {
flavors.push(flavor);
}
});
const weights = [];
                // Longest units first; otherwise the 'g' alternative would
                // swallow 'gram' and 'mg' matches.
                const weightMatches = allText.matchAll(/(\d+\.?\d*\s*(?:grams?|mg|oz|g))/gi);
for (const match of weightMatches) {
const weight = match[1].trim();
if (!weights.includes(weight)) {
weights.push(weight);
}
}
return {
fullSizeImage,
description,
thc,
cbd,
strainType,
terpenes,
effects,
brand,
lineage,
flavors,
weights
};
});
return details;
}
catch (error) {
lastError = error;
logger_1.logger.warn('scraper', ` Attempt ${attempt}/${maxRetries} failed for ${productName}: ${error}`);
if (attempt < maxRetries) {
await page.waitForTimeout(2000);
}
}
}
logger_1.logger.error('scraper', ` ✗ All attempts failed for ${productName}`);
return {
fullSizeImage: null,
description: null,
thc: null,
cbd: null,
strainType: null,
terpenes: [],
effects: [],
brand: null,
lineage: null,
flavors: [],
weights: []
};
}
async function scrapeCategory(storeId, categoryId) {
let browser = null;
try {
const categoryResult = await migrate_1.pool.query(`
SELECT c.*, s.slug as store_slug, s.name as store_name
FROM categories c
JOIN stores s ON c.store_id = s.id
WHERE c.id = $1
`, [categoryId]);
if (categoryResult.rows.length === 0) {
throw new Error('Category not found');
}
const category = categoryResult.rows[0];
logger_1.logger.info('scraper', `Scraping category: ${category.name} for ${category.store_name}`);
const proxy = await getActiveProxy();
const launchOptions = {
headless: 'new',
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'--window-size=1920,1080'
]
};
if (proxy) {
if (proxy.protocol === 'socks5') {
launchOptions.args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
}
else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
launchOptions.args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
logger_1.logger.info('scraper', `Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
browser = await puppeteer_1.default.launch(launchOptions);
const page = await browser.newPage();
await makePageStealthy(page);
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent(getRandomUserAgent());
logger_1.logger.info('scraper', `Loading page: ${category.dutchie_url}`);
try {
await page.goto(category.dutchie_url, {
waitUntil: 'domcontentloaded',
timeout: 60000
});
await page.waitForTimeout(5000);
logger_1.logger.info('scraper', 'Scrolling to load all products...');
await autoScroll(page);
await page.waitForTimeout(3000);
}
catch (navError) {
logger_1.logger.error('scraper', `Navigation error: ${navError}`);
throw navError;
}
logger_1.logger.info('scraper', 'Extracting product list from page...');
const products = await page.evaluate(() => {
const items = [];
const cards = document.querySelectorAll('[data-testid="product-list-item"]');
console.log(`Found ${cards.length} product cards`);
cards.forEach((card) => {
try {
const allText = card.textContent || '';
let name = '';
const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
for (const sel of nameSelectors) {
const el = card.querySelector(sel);
if (el?.textContent?.trim()) {
name = el.textContent.trim();
name = name.split('\n')[0].trim();
break;
}
}
if (!name || name.length < 2)
return;
let price = null;
let originalPrice = null;
const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
if (priceMatches && priceMatches.length > 0) {
price = parseFloat(priceMatches[0].replace('$', ''));
if (priceMatches.length > 1) {
originalPrice = parseFloat(priceMatches[1].replace('$', ''));
}
}
const linkEl = card.querySelector('a[href*="/product/"]');
let href = linkEl?.href || linkEl?.getAttribute('href') || '';
if (href && href.startsWith('/')) {
href = 'https://dutchie.com' + href;
}
items.push({
name,
price,
originalPrice,
href: href || window.location.href
});
}
catch (err) {
console.error('Error parsing product card:', err);
}
});
return items;
});
logger_1.logger.info('scraper', `Found ${products.length} products total`);
logger_1.logger.info('scraper', `Now visiting each product page for complete details...`);
let successCount = 0;
let failCount = 0;
for (let i = 0; i < products.length; i++) {
const product = products[i];
try {
logger_1.logger.info('scraper', ` [${i + 1}/${products.length}] ${product.name}`);
if (!product.href) {
logger_1.logger.warn('scraper', ` ⚠ No product URL, skipping details`);
product.metadata = {};
failCount++;
continue;
}
const details = await scrapeProductDetails(page, product.href, product.name);
product.imageUrl = details.fullSizeImage ? getFullSizeImageUrl(details.fullSizeImage) : null;
product.description = details.description;
product.thc = details.thc;
product.cbd = details.cbd;
product.strainType = details.strainType;
product.brand = details.brand;
product.weight = details.weights.length > 0 ? details.weights[0] : null;
product.metadata = {
terpenes: details.terpenes,
effects: details.effects,
lineage: details.lineage,
flavors: details.flavors,
allWeights: details.weights
};
if (details.thc || details.cbd || details.description) {
logger_1.logger.info('scraper', ` ✓ THC: ${details.thc}%, CBD: ${details.cbd}%`);
successCount++;
}
else {
logger_1.logger.warn('scraper', ` ⚠ Limited data extracted`);
failCount++;
}
await page.waitForTimeout(1500);
}
catch (error) {
logger_1.logger.error('scraper', ` ✗ Unexpected error: ${error}`);
product.metadata = {};
failCount++;
}
}
await browser.close();
logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
logger_1.logger.info('scraper', `✅ Category complete: ${category.name}`);
logger_1.logger.info('scraper', ` Total products: ${products.length}`);
logger_1.logger.info('scraper', ` Success: ${successCount}`);
logger_1.logger.info('scraper', ` Failed: ${failCount}`);
logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
await migrate_1.pool.query(`
UPDATE categories
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [categoryId]);
const formattedProducts = products.map((p, index) => {
const sanitized = sanitizeProductData(p);
return {
dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
name: sanitized.name,
description: sanitized.description,
price: p.price,
originalPrice: p.originalPrice,
thcPercentage: sanitized.thc,
cbdPercentage: sanitized.cbd,
strainType: p.strainType,
brand: sanitized.brand,
weight: sanitized.weight,
imageUrl: p.imageUrl,
dutchieUrl: p.href,
metadata: p.metadata || {}
};
});
return formattedProducts;
}
catch (error) {
logger_1.logger.error('scraper', `❌ Category scraping error: ${error}`);
if (browser) {
try {
await browser.close();
}
catch (e) {
logger_1.logger.error('scraper', `Error closing browser: ${e}`);
}
}
throw error;
}
}
async function autoScroll(page) {
await page.evaluate(async () => {
await new Promise((resolve) => {
let totalHeight = 0;
const distance = 500;
const timer = setInterval(() => {
const scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if (totalHeight >= scrollHeight) {
clearInterval(timer);
resolve();
}
}, 200);
});
});
}
async function saveProducts(storeId, categoryId, products) {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
logger_1.logger.info('scraper', `Saving ${products.length} products to database...`);
await client.query(`
UPDATE products
SET in_stock = false
WHERE store_id = $1 AND category_id = $2
`, [storeId, categoryId]);
for (const product of products) {
try {
const existingResult = await client.query(`
SELECT id, image_url, local_image_path
FROM products
WHERE store_id = $1 AND name = $2 AND category_id = $3
`, [storeId, product.name, categoryId]);
let localImagePath = null;
let productId;
if (existingResult.rows.length > 0) {
productId = existingResult.rows[0].id;
localImagePath = existingResult.rows[0].local_image_path;
await client.query(`
UPDATE products
SET name = $1, description = $2, price = $3,
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE id = $12
`, [
product.name, product.description, product.price,
product.strainType, product.thcPercentage, product.cbdPercentage,
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
JSON.stringify(product.metadata), productId
]);
}
else {
const insertResult = await client.query(`
INSERT INTO products (
store_id, category_id, dutchie_product_id, name, description,
price, strain_type, thc_percentage, cbd_percentage,
brand, weight, image_url, dutchie_url, in_stock, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
RETURNING id
`, [
storeId, categoryId, product.dutchieProductId, product.name, product.description,
product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
JSON.stringify(product.metadata)
]);
productId = insertResult.rows[0].id;
}
if (product.imageUrl && !localImagePath) {
try {
localImagePath = await (0, minio_1.uploadImageFromUrl)(product.imageUrl, productId);
await client.query(`
UPDATE products
SET local_image_path = $1
WHERE id = $2
`, [localImagePath, productId]);
}
catch (error) {
logger_1.logger.error('images', `Failed to download image for ${product.name}: ${error}`);
}
}
}
catch (productError) {
logger_1.logger.error('scraper', `Failed to save product ${product.name}: ${productError}`);
}
}
await client.query('COMMIT');
        logger_1.logger.info('scraper', `✅ Save pass complete: ${products.length} products processed`);
}
catch (error) {
await client.query('ROLLBACK');
logger_1.logger.error('scraper', `Error saving products: ${error}`);
throw error;
}
finally {
client.release();
}
}
async function scrapeStore(storeId) {
try {
logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId}`);
const categoriesResult = await migrate_1.pool.query(`
SELECT c.id, c.name, c.slug, c.dutchie_url
FROM categories c
WHERE c.store_id = $1
AND c.scrape_enabled = true
AND NOT EXISTS (
SELECT 1 FROM categories child
WHERE child.parent_id = c.id
)
ORDER BY c.display_order, c.name
`, [storeId]);
logger_1.logger.info('scraper', `Found ${categoriesResult.rows.length} categories to scrape`);
for (const category of categoriesResult.rows) {
try {
logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
logger_1.logger.info('scraper', `📂 Scraping: ${category.name}`);
logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
const products = await scrapeCategory(storeId, category.id);
await saveProducts(storeId, category.id, products);
logger_1.logger.info('scraper', `✅ Completed ${category.name} - ${products.length} products saved`);
}
catch (error) {
logger_1.logger.error('scraper', `❌ Failed to scrape ${category.name}: ${error}`);
}
await new Promise(resolve => setTimeout(resolve, 5000));
}
await migrate_1.pool.query(`
UPDATE stores
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [storeId]);
logger_1.logger.info('scraper', `🎉 Store scrape completed: ID ${storeId}`);
}
catch (error) {
logger_1.logger.error('scraper', `❌ Store scrape failed: ${error}`);
throw error;
}
}
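/*
 * Usage sketch (illustrative only): scraping a single category
 * end-to-end. The IDs are assumptions.
 *
 * (async () => {
 *     const products = await scrapeCategory(1, 10); // storeId, categoryId
 *     await saveProducts(1, 10, products);
 * })();
 */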

122
backend/dist/utils/minio.js vendored Normal file
View File

@@ -0,0 +1,122 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.minioClient = void 0;
exports.initializeMinio = initializeMinio;
exports.uploadImageFromUrl = uploadImageFromUrl;
exports.getImageUrl = getImageUrl;
exports.deleteImage = deleteImage;
const Minio = __importStar(require("minio"));
const axios_1 = __importDefault(require("axios"));
const uuid_1 = require("uuid");
const minioClient = new Minio.Client({
endPoint: process.env.MINIO_ENDPOINT || 'minio',
port: parseInt(process.env.MINIO_PORT || '9000'),
useSSL: process.env.MINIO_USE_SSL === 'true',
accessKey: process.env.MINIO_ACCESS_KEY || 'minioadmin',
secretKey: process.env.MINIO_SECRET_KEY || 'minioadmin',
});
exports.minioClient = minioClient;
const BUCKET_NAME = process.env.MINIO_BUCKET || 'dutchie';
async function initializeMinio() {
try {
// Check if bucket exists
const exists = await minioClient.bucketExists(BUCKET_NAME);
if (!exists) {
// Create bucket
await minioClient.makeBucket(BUCKET_NAME, 'us-east-1');
console.log(`✅ Minio bucket created: ${BUCKET_NAME}`);
// Set public read policy
const policy = {
Version: '2012-10-17',
Statement: [
{
Effect: 'Allow',
Principal: { AWS: ['*'] },
Action: ['s3:GetObject'],
Resource: [`arn:aws:s3:::${BUCKET_NAME}/*`],
},
],
};
await minioClient.setBucketPolicy(BUCKET_NAME, JSON.stringify(policy));
console.log(`✅ Bucket policy set to public read`);
}
else {
console.log(`✅ Minio bucket already exists: ${BUCKET_NAME}`);
}
}
catch (error) {
console.error('❌ Minio initialization error:', error);
throw error;
}
}
async function uploadImageFromUrl(imageUrl, productId) {
try {
// Download image
const response = await axios_1.default.get(imageUrl, { responseType: 'arraybuffer' });
const buffer = Buffer.from(response.data);
// Generate unique filename
        // Derive the extension from the URL path only; Dutchie image URLs
        // often have no extension at all, so fall back to jpg. (The old
        // split('.') approach returned 'com/<id>' for extensionless URLs.)
        const urlPath = imageUrl.split('?')[0];
        const extMatch = urlPath.match(/\.([a-z0-9]{2,4})$/i);
        const ext = extMatch ? extMatch[1] : 'jpg';
const filename = `products/${productId}-${(0, uuid_1.v4)()}.${ext}`;
// Get content type
const contentType = response.headers['content-type'] || 'image/jpeg';
// Upload to Minio
await minioClient.putObject(BUCKET_NAME, filename, buffer, buffer.length, {
'Content-Type': contentType,
});
// Return the path (URL will be constructed when serving)
return filename;
}
catch (error) {
console.error('Error uploading image:', error);
throw error;
}
}
function getImageUrl(path) {
// Use localhost:9020 for browser access since Minio is exposed on host port 9020
const endpoint = process.env.MINIO_PUBLIC_ENDPOINT || 'http://localhost:9020';
return `${endpoint}/${BUCKET_NAME}/${path}`;
}
async function deleteImage(path) {
try {
await minioClient.removeObject(BUCKET_NAME, path);
}
catch (error) {
console.error('Error deleting image:', error);
}
}
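/*
 * Usage sketch (illustrative only): mirror a remote image and build its
 * public URL. The product ID and source URL are assumptions.
 *
 * (async () => {
 *     await initializeMinio();
 *     const objectPath = await uploadImageFromUrl('https://images.dutchie.com/abc123?w=2000', 42);
 *     console.log(getImageUrl(objectPath)); // http://localhost:9020/dutchie/products/42-<uuid>.jpg
 * })();
 */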