feat(images): Add local image storage with on-demand resizing

- Store product images locally with hierarchy: /images/products/<state>/<store>/<brand>/<product>/
- Add /img/* proxy endpoint for on-demand resizing via Sharp
- Implement per-product image checking to skip existing downloads
- Fix pathToUrl() to correctly generate /images/... URLs
- Add frontend getImageUrl() helper with preset sizes (thumb, medium, large)
- Update all product pages to use optimized image URLs
- Add stealth session support for Dutchie GraphQL crawls
- Include test scripts for crawl and image verification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-09 11:04:42 -07:00
parent aa776226b0
commit 91efd1d03d
28 changed files with 2027 additions and 205 deletions

View File

@@ -16,6 +16,12 @@ import {
NormalizedBrand,
NormalizationResult,
} from './types';
import {
downloadProductImage,
ProductImageContext,
isImageStorageReady,
LocalImageSizes,
} from '../utils/image-storage';
const BATCH_SIZE = 100;
@@ -23,10 +29,21 @@ const BATCH_SIZE = 100;
// PRODUCT UPSERTS
// ============================================================
/**
 * Identifying details of one store_products row touched by an upsert.
 *
 * Despite the name, instances are built for every upserted row (newly
 * inserted or updated); they are used both to report new products and to
 * queue image downloads.
 */
export interface NewProductInfo {
id: number; // store_products.id (as returned by the upsert's RETURNING clause)
externalProductId: string; // provider_product_id
name: string;
brandName: string | null;
primaryImageUrl: string | null; // Remote source image URL; null when the product has none
hasLocalImage?: boolean; // True if local_image_path is already set
}
/**
 * Outcome of upsertStoreProducts.
 */
export interface UpsertProductsResult {
upserted: number; // new + updated
new: number; // Rows that were inserted
updated: number; // Rows that already existed and were updated
newProducts: NewProductInfo[]; // Details of newly created products
productsNeedingImages: NewProductInfo[]; // Products (new or updated) that need image downloads
}
/**
@@ -41,12 +58,14 @@ export async function upsertStoreProducts(
options: { dryRun?: boolean } = {}
): Promise<UpsertProductsResult> {
if (products.length === 0) {
return { upserted: 0, new: 0, updated: 0 };
return { upserted: 0, new: 0, updated: 0, newProducts: [], productsNeedingImages: [] };
}
const { dryRun = false } = options;
let newCount = 0;
let updatedCount = 0;
const newProducts: NewProductInfo[] = [];
const productsNeedingImages: NewProductInfo[] = [];
// Process in batches
for (let i = 0; i < products.length; i += BATCH_SIZE) {
@@ -104,7 +123,7 @@ export async function upsertStoreProducts(
image_url = EXCLUDED.image_url,
last_seen_at = NOW(),
updated_at = NOW()
RETURNING (xmax = 0) as is_new`,
RETURNING id, (xmax = 0) as is_new, (local_image_path IS NOT NULL) as has_local_image`,
[
product.dispensaryId,
product.platform,
@@ -129,10 +148,30 @@ export async function upsertStoreProducts(
]
);
if (result.rows[0]?.is_new) {
const row = result.rows[0];
const productInfo: NewProductInfo = {
id: row.id,
externalProductId: product.externalProductId,
name: product.name,
brandName: product.brandName,
primaryImageUrl: product.primaryImageUrl,
hasLocalImage: row.has_local_image,
};
if (row.is_new) {
newCount++;
// Track new products
newProducts.push(productInfo);
// New products always need images (if they have a source URL)
if (product.primaryImageUrl && !row.has_local_image) {
productsNeedingImages.push(productInfo);
}
} else {
updatedCount++;
// Updated products need images only if they don't have a local image yet
if (product.primaryImageUrl && !row.has_local_image) {
productsNeedingImages.push(productInfo);
}
}
}
@@ -149,6 +188,8 @@ export async function upsertStoreProducts(
upserted: newCount + updatedCount,
new: newCount,
updated: updatedCount,
newProducts,
productsNeedingImages,
};
}
@@ -564,6 +605,19 @@ export async function upsertBrands(
// FULL HYDRATION
// ============================================================
/**
 * Counters reported by downloadProductImages.
 */
export interface ImageDownloadResult {
downloaded: number; // Images fetched and stored during this run
skipped: number; // Images not fetched: already present, storage not ready, or dry run
failed: number; // Downloads that reported failure or threw
bytesTotal: number; // Sum of bytesDownloaded over the downloaded images
}
/**
 * Store-level values used to build the image storage context of each product.
 */
export interface DispensaryContext {
stateCode: string; // dispensaries.state, or 'unknown' when not set
storeSlug: string; // dispensaries.slug, or a slug derived from the store name
hasExistingProducts?: boolean; // True if store already has products with local images
}
export interface HydratePayloadResult {
productsUpserted: number;
productsNew: number;
@@ -574,6 +628,154 @@ export interface HydratePayloadResult {
variantsUpserted: number;
variantsNew: number;
variantSnapshotsCreated: number;
imagesDownloaded: number;
imagesSkipped: number;
imagesFailed: number;
imagesBytesTotal: number;
}
/**
 * Create a path-safe slug from an arbitrary string.
 *
 * The input is lower-cased, every run of characters outside [a-z0-9] becomes
 * a single hyphen, leading/trailing hyphens are removed and the result is
 * capped at 50 characters.
 *
 * @param str - Text to convert (e.g. a brand or store name).
 * @returns The slug, or 'unknown' when nothing usable remains.
 */
function slugify(str: string): string {
  const slug = str
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .substring(0, 50)
    // Truncation can cut right after a separator; trim again so the slug
    // never ends with a hyphen.
    .replace(/-+$/, '');
  return slug || 'unknown';
}
/**
 * Download images for products that need them and record the local paths.
 *
 * Products without a source image URL are ignored. The remaining products are
 * processed in batches of `concurrency` parallel downloads. For every
 * download that reports success (including ones the storage layer skipped
 * because the file already exists) the matching store_products row is
 * updated with the local image URLs.
 *
 * @param pool - Database pool used to update store_products.
 * @param newProducts - Products to process (new products and backfills alike).
 * @param dispensaryContext - State code and store slug used in the storage path.
 * @param options - `dryRun` only logs what would be downloaded; `concurrency`
 *   is the batch size (default 5, values below 1 are treated as 1).
 * @returns Counts of downloaded, skipped and failed images plus total bytes.
 */
export async function downloadProductImages(
  pool: Pool,
  newProducts: NewProductInfo[],
  dispensaryContext: DispensaryContext,
  options: { dryRun?: boolean; concurrency?: number } = {}
): Promise<ImageDownloadResult> {
  const { dryRun = false, concurrency = 5 } = options;
  // A batch size of 0, a negative number or NaN would keep the loop below
  // from ever advancing, so fall back to one download at a time.
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;

  // Filter products that have images to download
  const productsWithImages = newProducts.filter(
    (p): p is NewProductInfo & { primaryImageUrl: string } => Boolean(p.primaryImageUrl)
  );
  if (productsWithImages.length === 0) {
    return { downloaded: 0, skipped: 0, failed: 0, bytesTotal: 0 };
  }

  // Check if image storage is ready
  if (!isImageStorageReady()) {
    console.warn('[ImageDownload] Image storage not initialized, skipping downloads');
    return { downloaded: 0, skipped: productsWithImages.length, failed: 0, bytesTotal: 0 };
  }

  if (dryRun) {
    console.log(`[DryRun] Would download ${productsWithImages.length} images`);
    return { downloaded: 0, skipped: productsWithImages.length, failed: 0, bytesTotal: 0 };
  }

  let downloaded = 0;
  let skipped = 0;
  let failed = 0;
  let bytesTotal = 0;

  // Process in batches with concurrency limit
  for (let i = 0; i < productsWithImages.length; i += batchSize) {
    const batch = productsWithImages.slice(i, i + batchSize);
    const results = await Promise.allSettled(
      batch.map(async (product) => {
        const ctx: ProductImageContext = {
          stateCode: dispensaryContext.stateCode,
          storeSlug: dispensaryContext.storeSlug,
          brandSlug: slugify(product.brandName || 'unknown'),
          productId: product.externalProductId,
        };
        const result = await downloadProductImage(product.primaryImageUrl, ctx, { skipIfExists: true });
        if (result.success) {
          const urls = result.urls;
          if (!urls) {
            // Rejecting here counts the product as failed in the tally below.
            throw new Error(`Image download reported success without URLs for product ${product.id}`);
          }
          // Update the database with local image path
          const imagesJson = JSON.stringify({
            full: urls.full,
            medium: urls.medium,
            thumb: urls.thumb,
          });
          await pool.query(
            `UPDATE store_products
SET local_image_path = $1, images = $2
WHERE id = $3`,
            [urls.full, imagesJson, product.id]
          );
        }
        return result;
      })
    );

    for (const result of results) {
      if (result.status === 'fulfilled') {
        const downloadResult = result.value;
        if (downloadResult.success) {
          if (downloadResult.skipped) {
            skipped++;
          } else {
            downloaded++;
            bytesTotal += downloadResult.bytesDownloaded || 0;
          }
        } else {
          failed++;
          console.warn(`[ImageDownload] Failed: ${downloadResult.error}`);
        }
      } else {
        failed++;
        console.error(`[ImageDownload] Error:`, result.reason);
      }
    }
  }

  console.log(`[ImageDownload] Downloaded: ${downloaded}, Skipped: ${skipped}, Failed: ${failed}, Bytes: ${bytesTotal}`);
  return { downloaded, skipped, failed, bytesTotal };
}
/**
 * Load the values needed to build image storage paths for one dispensary.
 *
 * The same query also reports whether any of the dispensary's products
 * already has a local image path.
 *
 * @param pool - Database pool to query.
 * @param dispensaryId - Primary key of the dispensary.
 * @returns The context, or null when the dispensary is not found or the
 *   lookup throws (the error is logged, not rethrown).
 */
async function getDispensaryContext(pool: Pool, dispensaryId: number): Promise<DispensaryContext | null> {
  const lookupSql = `SELECT
d.state,
d.slug,
d.name,
EXISTS(
SELECT 1 FROM store_products sp
WHERE sp.dispensary_id = d.id
AND sp.local_image_path IS NOT NULL
LIMIT 1
) as has_local_images
FROM dispensaries d
WHERE d.id = $1`;

  try {
    const { rows } = await pool.query(lookupSql, [dispensaryId]);
    if (rows.length === 0) {
      return null;
    }

    const [dispensary] = rows;
    // Fall back to a slug derived from the name (or the id) when none is stored.
    const storeSlug = dispensary.slug || slugify(dispensary.name || `store-${dispensaryId}`);
    return {
      stateCode: dispensary.state || 'unknown',
      storeSlug,
      hasExistingProducts: dispensary.has_local_images,
    };
  } catch (error) {
    console.error('[getDispensaryContext] Error:', error);
    return null;
  }
}
/**
@@ -584,9 +786,9 @@ export async function hydrateToCanonical(
dispensaryId: number,
normResult: NormalizationResult,
crawlRunId: number | null,
options: { dryRun?: boolean } = {}
options: { dryRun?: boolean; downloadImages?: boolean } = {}
): Promise<HydratePayloadResult> {
const { dryRun = false } = options;
const { dryRun = false, downloadImages: shouldDownloadImages = true } = options;
// 1. Upsert brands
const brandResult = await upsertBrands(pool, normResult.brands, { dryRun });
@@ -634,6 +836,36 @@ export async function hydrateToCanonical(
{ dryRun }
);
// 6. Download images for products that need them
// This includes:
// - New products (always need images)
// - Updated products that don't have local images yet (backfill)
// This avoids:
// - Filesystem checks for products that already have local images
// - Unnecessary HTTP requests for products with existing images
let imageResult: ImageDownloadResult = { downloaded: 0, skipped: 0, failed: 0, bytesTotal: 0 };
if (shouldDownloadImages && productResult.productsNeedingImages.length > 0) {
const dispensaryContext = await getDispensaryContext(pool, dispensaryId);
if (dispensaryContext) {
const newCount = productResult.productsNeedingImages.filter(p => !p.hasLocalImage).length;
const backfillCount = productResult.productsNeedingImages.length - newCount;
console.log(`[Hydration] Downloading images for ${productResult.productsNeedingImages.length} products (${productResult.new} new, ${backfillCount} backfill)...`);
imageResult = await downloadProductImages(
pool,
productResult.productsNeedingImages,
dispensaryContext,
{ dryRun }
);
} else {
console.warn(`[Hydration] Could not get dispensary context for ID ${dispensaryId}, skipping image downloads`);
}
} else if (productResult.productsNeedingImages.length === 0 && productResult.upserted > 0) {
// All products already have local images
console.log(`[Hydration] All ${productResult.upserted} products already have local images, skipping downloads`);
}
return {
productsUpserted: productResult.upserted,
productsNew: productResult.new,
@@ -644,5 +876,9 @@ export async function hydrateToCanonical(
variantsUpserted: variantResult.upserted,
variantsNew: variantResult.new,
variantSnapshotsCreated: variantResult.snapshotsCreated,
imagesDownloaded: imageResult.downloaded,
imagesSkipped: imageResult.skipped,
imagesFailed: imageResult.failed,
imagesBytesTotal: imageResult.bytesTotal,
};
}

View File

@@ -7,6 +7,7 @@ import { initializeImageStorage } from './utils/image-storage';
import { logger } from './services/logger';
import { cleanupOrphanedJobs } from './services/proxyTestQueue';
import healthRoutes from './routes/health';
import imageProxyRoutes from './routes/image-proxy';
dotenv.config();
@@ -29,6 +30,10 @@ app.use(express.json());
const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || './public/images';
app.use('/images', express.static(LOCAL_IMAGES_PATH));
// Image proxy with on-demand resizing
// Usage: /img/products/az/store/brand/product/image.webp?w=200&h=200
app.use('/img', imageProxyRoutes);
// Serve static downloads (plugin files, etc.)
// Uses ./public/downloads relative to working directory (works for both Docker and local dev)
const LOCAL_DOWNLOADS_PATH = process.env.LOCAL_DOWNLOADS_PATH || './public/downloads';

View File

@@ -213,7 +213,24 @@ const FINGERPRINTS: Fingerprint[] = [
let currentFingerprintIndex = 0;
// Forward declaration for session (actual CrawlSession interface defined later)
// Module-level state: the active crawl session, or null when none is active.
// Set by startSession(), cleared by endSession(), read by getFingerprint().
let currentSession: {
sessionId: string;
fingerprint: Fingerprint;
proxyUrl: string | null;
stateCode?: string;
timezone?: string;
startedAt: Date;
} | null = null;
/**
 * Return the fingerprint requests should currently use.
 *
 * While a session is active its fingerprint wins; otherwise the entry of the
 * fingerprint pool selected by the rotation index is returned.
 */
export function getFingerprint(): Fingerprint {
  return currentSession
    ? currentSession.fingerprint
    : FINGERPRINTS[currentFingerprintIndex];
}
@@ -228,6 +245,103 @@ export function resetFingerprint(): void {
currentFingerprintIndex = 0;
}
/**
 * Pick one fingerprint from the pool uniformly at random.
 */
export function getRandomFingerprint(): Fingerprint {
  return FINGERPRINTS[Math.floor(Math.random() * FINGERPRINTS.length)];
}
// ============================================================
// SESSION MANAGEMENT
// Per-session fingerprint rotation for stealth
// ============================================================
/**
 * One crawl session: a fixed identity (fingerprint + proxy) kept from
 * startSession() until endSession().
 */
export interface CrawlSession {
sessionId: string; // `session_<epoch ms>_<random suffix>`, generated by startSession()
fingerprint: Fingerprint; // Fingerprint returned by getFingerprint() while the session is active
proxyUrl: string | null; // Value of the current proxy at the time the session started
stateCode?: string; // Passed through from startSession()
timezone?: string; // Passed through from startSession(); also selects Accept-Language
startedAt: Date; // Creation time; used by endSession() to log the duration
}
// Note: currentSession variable declared earlier in file for proper scoping
/**
 * Accept-Language value used when no timezone-specific entry applies.
 */
const DEFAULT_ACCEPT_LANGUAGE = 'en-US,en;q=0.9';

/**
 * Timezone to Accept-Language mapping
 * US timezones all use en-US but this can be extended for international
 */
const TIMEZONE_TO_LOCALE: Record<string, string> = {
  'America/Phoenix': 'en-US,en;q=0.9',
  'America/Los_Angeles': 'en-US,en;q=0.9',
  'America/Denver': 'en-US,en;q=0.9',
  'America/Chicago': 'en-US,en;q=0.9',
  'America/New_York': 'en-US,en;q=0.9',
  'America/Detroit': 'en-US,en;q=0.9',
  'America/Anchorage': 'en-US,en;q=0.9',
  'Pacific/Honolulu': 'en-US,en;q=0.9',
};

/**
 * Get Accept-Language header for a given timezone
 *
 * @param timezone - IANA timezone name, e.g. 'America/Phoenix'.
 * @returns The mapped Accept-Language value, or the en-US default when the
 *   timezone is missing or not in the table.
 */
export function getLocaleForTimezone(timezone?: string): string {
  if (!timezone) return DEFAULT_ACCEPT_LANGUAGE;
  // Only look at the table's own keys: a plain property read would also
  // resolve inherited names such as 'constructor' to non-string values.
  if (!Object.prototype.hasOwnProperty.call(TIMEZONE_TO_LOCALE, timezone)) {
    return DEFAULT_ACCEPT_LANGUAGE;
  }
  return TIMEZONE_TO_LOCALE[timezone] || DEFAULT_ACCEPT_LANGUAGE;
}
/**
 * Begin a new crawl session and make it the active one.
 *
 * A fingerprint is drawn at random from the pool and its Accept-Language is
 * replaced with the value mapped from `timezone`, so the headers stay
 * geographically consistent. Call this before crawling a store to get a
 * fresh identity.
 *
 * @param stateCode - Optional state code stored on the session.
 * @param timezone - Optional timezone; selects Accept-Language and is logged.
 * @returns The session that is now active.
 */
export function startSession(stateCode?: string, timezone?: string): CrawlSession {
  const poolFingerprint = getRandomFingerprint();
  const acceptLanguage = getLocaleForTimezone(timezone);
  const fingerprint: Fingerprint = { ...poolFingerprint, acceptLanguage };

  const createdAtMs = Date.now();
  const randomSuffix = Math.random().toString(36).slice(2, 8);
  const session: CrawlSession = {
    sessionId: `session_${createdAtMs}_${randomSuffix}`,
    fingerprint,
    proxyUrl: currentProxy,
    stateCode,
    timezone,
    startedAt: new Date(),
  };
  currentSession = session;

  console.log(`[Dutchie Client] Started session ${session.sessionId}`);
  console.log(`[Dutchie Client] Fingerprint: ${fingerprint.userAgent.slice(0, 50)}...`);
  console.log(`[Dutchie Client] Accept-Language: ${fingerprint.acceptLanguage}`);
  if (timezone) {
    console.log(`[Dutchie Client] Timezone: ${timezone}`);
  }

  return session;
}
/**
 * Close the active crawl session, logging how long it lasted.
 * Does nothing when no session is active.
 */
export function endSession(): void {
  const active = currentSession;
  if (!active) {
    return;
  }

  const elapsedMs = Date.now() - active.startedAt.getTime();
  const duration = Math.round(elapsedMs / 1000);
  console.log(`[Dutchie Client] Ended session ${active.sessionId} (${duration}s)`);
  currentSession = null;
}
/**
 * Get current active session
 *
 * @returns The session created by the latest startSession() call, or null
 *   when no session is active (none started yet, or endSession() was called).
 */
export function getCurrentSession(): CrawlSession | null {
return currentSession;
}
// ============================================================
// CURL HTTP CLIENT
// ============================================================

View File

@@ -18,6 +18,13 @@ export {
getFingerprint,
rotateFingerprint,
resetFingerprint,
getRandomFingerprint,
getLocaleForTimezone,
// Session Management (per-store fingerprint rotation)
startSession,
endSession,
getCurrentSession,
// Proxy
setProxy,
@@ -32,6 +39,7 @@ export {
// Types
type CurlResponse,
type Fingerprint,
type CrawlSession,
type ExecuteGraphQLOptions,
type FetchPageOptions,
} from './client';

View File

@@ -8,10 +8,12 @@ router.use(authMiddleware);
// Valid menu_type values
const VALID_MENU_TYPES = ['dutchie', 'treez', 'jane', 'weedmaps', 'leafly', 'meadow', 'blaze', 'flowhub', 'dispense', 'cova', 'other', 'unknown'];
// Get all dispensaries
// Get all dispensaries (with pagination)
router.get('/', async (req, res) => {
try {
const { menu_type, city, state, crawl_enabled, dutchie_verified } = req.query;
const { menu_type, city, state, crawl_enabled, dutchie_verified, limit, offset, search } = req.query;
const pageLimit = Math.min(parseInt(limit as string) || 50, 500);
const pageOffset = parseInt(offset as string) || 0;
let query = `
SELECT
@@ -98,15 +100,34 @@ router.get('/', async (req, res) => {
}
}
if (conditions.length > 0) {
query += ` WHERE ${conditions.join(' AND ')}`;
// Search filter (name, dba_name, city)
if (search) {
conditions.push(`(name ILIKE $${params.length + 1} OR dba_name ILIKE $${params.length + 1} OR city ILIKE $${params.length + 1})`);
params.push(`%${search}%`);
}
// Build WHERE clause
const whereClause = conditions.length > 0 ? ` WHERE ${conditions.join(' AND ')}` : '';
// Get total count first
const countResult = await pool.query(`SELECT COUNT(*) FROM dispensaries${whereClause}`, params);
const total = parseInt(countResult.rows[0].count);
// Add pagination
query += whereClause;
query += ` ORDER BY name`;
query += ` LIMIT $${params.length + 1} OFFSET $${params.length + 2}`;
params.push(pageLimit, pageOffset);
const result = await pool.query(query, params);
res.json({ dispensaries: result.rows, total: result.rowCount });
res.json({
dispensaries: result.rows,
total,
limit: pageLimit,
offset: pageOffset,
hasMore: pageOffset + result.rows.length < total
});
} catch (error) {
console.error('Error fetching dispensaries:', error);
res.status(500).json({ error: 'Failed to fetch dispensaries' });

View File

@@ -0,0 +1,214 @@
/**
* Image Proxy Route
*
* On-demand image resizing service. Serves images with URL-based transforms.
*
* Usage:
* /img/<path>?w=200&h=200&q=80&fit=cover
*
* Parameters:
* w - width (pixels)
* h - height (pixels)
* q - quality (1-100, default 80)
* fit - resize fit: cover, contain, fill, inside, outside (default: inside)
* blur - blur sigma (0.3-1000)
* gray - grayscale (1 = enabled)
* format - output format: webp, jpeg, png, avif (default: webp)
*
* Examples:
* /img/products/az/store/brand/product/image.webp?w=200
* /img/products/az/store/brand/product/image.webp?w=600&h=400&fit=cover
* /img/products/az/store/brand/product/image.webp?w=100&blur=5&gray=1
*/
import { Router, Request, Response } from 'express';
import * as fs from 'fs/promises';
import * as path from 'path';
// @ts-ignore
const sharp = require('sharp');
const router = Router();
/**
 * Resolve the directory that stored images are read from.
 *
 * Precedence: IMAGES_PATH, then `<STORAGE_BASE_PATH>/images`, then the
 * relative default './storage/images'. Empty variables count as unset.
 */
function getImagesBasePath(): string {
  const { IMAGES_PATH: explicitPath, STORAGE_BASE_PATH: storageRoot } = process.env;
  if (explicitPath) {
    return explicitPath;
  }
  return storageRoot ? path.join(storageRoot, 'images') : './storage/images';
}

// Resolved once at module load; later environment changes are not picked up.
const IMAGES_BASE_PATH = getImagesBasePath();
// Allowed fit modes
const ALLOWED_FITS = ['cover', 'contain', 'fill', 'inside', 'outside'] as const;
type FitMode = typeof ALLOWED_FITS[number];

// Allowed formats
const ALLOWED_FORMATS = ['webp', 'jpeg', 'jpg', 'png', 'avif'] as const;
type OutputFormat = typeof ALLOWED_FORMATS[number];

// Cache lifetime sent with every image response (1 year in seconds).
// NOTE(review): responses are also marked `immutable`, which assumes a stored
// image is never replaced under the same path — confirm against the writer.
const CACHE_MAX_AGE = 31536000;

/** Normalized, range-checked transform options derived from the query string. */
interface TransformParams {
  width?: number;
  height?: number;
  quality: number;
  fit: FitMode;
  blur?: number;
  grayscale: boolean;
  format: OutputFormat;
}

/**
 * Parse one numeric query value and clamp it to [min, max].
 * Returns undefined when the value is absent or not a number, so a malformed
 * parameter is ignored instead of reaching the image pipeline as NaN.
 */
function parseClampedParam(
  raw: unknown,
  parse: (text: string) => number,
  min: number,
  max: number
): number | undefined {
  // Repeated parameters (?w=1&w=2) arrive as arrays; use the first value.
  const first: unknown = Array.isArray(raw) ? raw[0] : raw;
  if (first === undefined || first === null || first === '') {
    return undefined;
  }
  const value = parse(String(first));
  if (!Number.isFinite(value)) {
    return undefined;
  }
  return Math.min(Math.max(value, min), max);
}

/** Type guard: is the value one of the allowed fit modes? */
function isFitMode(value: unknown): value is FitMode {
  return (ALLOWED_FITS as readonly unknown[]).includes(value);
}

/** Type guard: is the value one of the allowed output formats? */
function isOutputFormat(value: unknown): value is OutputFormat {
  return (ALLOWED_FORMATS as readonly unknown[]).includes(value);
}

/**
 * Build transform options from the request query.
 *
 * Width and height are clamped to 1-4000, quality to 1-100 (default 80) and
 * blur to 0.3-1000. Unknown `fit` falls back to 'inside' and unknown `format`
 * to 'webp'. Non-numeric values are treated as if the parameter was absent.
 *
 * @param query - Parsed query string (e.g. Express `req.query`).
 * @returns Options that are safe to hand to the resize pipeline.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- callers pass req.query untyped
function parseTransformParams(query: any): TransformParams {
  const parseInteger = (text: string): number => parseInt(text, 10);
  return {
    width: parseClampedParam(query.w, parseInteger, 1, 4000),
    height: parseClampedParam(query.h, parseInteger, 1, 4000),
    quality: parseClampedParam(query.q, parseInteger, 1, 100) ?? 80,
    fit: isFitMode(query.fit) ? query.fit : 'inside',
    blur: parseClampedParam(query.blur, parseFloat, 0.3, 1000),
    grayscale: query.gray === '1' || query.grayscale === '1',
    format: isOutputFormat(query.format) ? query.format : 'webp',
  };
}
/**
 * Map an output format to the Content-Type header value sent with it.
 * 'jpg' and 'jpeg' share one MIME type; anything unrecognized is reported
 * as WebP.
 */
function getContentType(format: OutputFormat): string {
  if (format === 'jpeg' || format === 'jpg') {
    return 'image/jpeg';
  }
  if (format === 'png') {
    return 'image/png';
  }
  if (format === 'avif') {
    return 'image/avif';
  }
  return 'image/webp';
}
/**
 * Image proxy endpoint
 * GET /img/*
 *
 * Resolves the wildcard path below IMAGES_BASE_PATH, applies the transforms
 * requested in the query string and responds with the encoded image.
 *
 * Responses:
 *   400 - no image path given
 *   403 - resolved path is outside the images directory
 *   404 - path does not exist or is not a regular file
 *   500 - reading or processing the image failed
 */
router.get('/*', async (req: Request, res: Response) => {
  try {
    // Get the image path from URL (everything after /img/)
    const imagePath = req.params[0];
    if (!imagePath) {
      return res.status(400).json({ error: 'Image path required' });
    }

    // Security: prevent directory traversal
    const normalizedPath = path.normalize(imagePath).replace(/^(\.\.(\/|\\|$))+/, '');
    const basePath = path.resolve(IMAGES_BASE_PATH);
    const fullPath = path.resolve(path.join(IMAGES_BASE_PATH, normalizedPath));

    // Ensure path is within base directory. Compare against the base plus a
    // separator so a sibling such as "<base>-private" cannot match the prefix.
    const basePrefix = basePath.endsWith(path.sep) ? basePath : basePath + path.sep;
    if (fullPath !== basePath && !fullPath.startsWith(basePrefix)) {
      console.error(`[ImageProxy] Path traversal attempt: ${fullPath} not in ${basePath}`);
      return res.status(403).json({ error: 'Access denied' });
    }

    // Check that the target exists and is a regular file; directories would
    // otherwise fail later in readFile and surface as a 500.
    let isRegularFile = false;
    try {
      isRegularFile = (await fs.stat(fullPath)).isFile();
    } catch {
      isRegularFile = false;
    }
    if (!isRegularFile) {
      return res.status(404).json({ error: 'Image not found' });
    }

    // Parse transform parameters
    const params = parseTransformParams(req.query);

    // Check if any transforms are requested
    const hasTransforms = params.width || params.height || params.blur || params.grayscale;

    // Read the original image
    const imageBuffer = await fs.readFile(fullPath);

    // The original bytes can only be passed through untouched when the
    // response would be a default-quality WebP anyway AND the stored file is
    // WebP; otherwise the Content-Type header would not match the payload.
    const sourceIsWebp = path.extname(fullPath).toLowerCase() === '.webp';
    const canPassThrough =
      !hasTransforms && params.format === 'webp' && params.quality === 80 && sourceIsWebp;

    let outputBuffer: Buffer;
    if (canPassThrough) {
      outputBuffer = imageBuffer;
    } else {
      let pipeline = sharp(imageBuffer);

      // Resize
      if (params.width || params.height) {
        pipeline = pipeline.resize(params.width, params.height, {
          fit: params.fit,
          withoutEnlargement: true,
        });
      }

      // Blur
      if (params.blur) {
        pipeline = pipeline.blur(params.blur);
      }

      // Grayscale
      if (params.grayscale) {
        pipeline = pipeline.grayscale();
      }

      // Output format
      switch (params.format) {
        case 'jpeg':
        case 'jpg':
          pipeline = pipeline.jpeg({ quality: params.quality });
          break;
        case 'png':
          pipeline = pipeline.png({ quality: params.quality });
          break;
        case 'avif':
          pipeline = pipeline.avif({ quality: params.quality });
          break;
        case 'webp':
        default:
          pipeline = pipeline.webp({ quality: params.quality });
      }

      outputBuffer = await pipeline.toBuffer();
    }

    // Set headers
    res.setHeader('Content-Type', getContentType(params.format));
    res.setHeader('Cache-Control', `public, max-age=${CACHE_MAX_AGE}, immutable`);
    res.setHeader('X-Image-Size', outputBuffer.length);

    // Send image
    res.send(outputBuffer);
  } catch (error: unknown) {
    console.error('[ImageProxy] Error:', error instanceof Error ? error.message : error);
    res.status(500).json({ error: 'Failed to process image' });
  }
});
export default router;

View File

@@ -8,11 +8,13 @@ const router = Router();
*/
router.get('/', async (req: Request, res: Response) => {
try {
const gitSha = process.env.APP_GIT_SHA || 'unknown';
const versionInfo = {
build_version: process.env.APP_BUILD_VERSION || 'dev',
git_sha: process.env.APP_GIT_SHA || 'local',
build_version: process.env.APP_BUILD_VERSION?.slice(0, 8) || 'dev',
git_sha: gitSha.slice(0, 8) || 'unknown',
git_sha_full: gitSha,
build_time: process.env.APP_BUILD_TIME || new Date().toISOString(),
image_tag: process.env.CONTAINER_IMAGE_TAG || 'local',
image_tag: process.env.CONTAINER_IMAGE_TAG?.slice(0, 8) || 'local',
};
res.json(versionInfo);

View File

@@ -0,0 +1,250 @@
#!/usr/bin/env npx tsx
/**
* Crawl Single Store - Verbose test showing each step
*
* Usage:
* DATABASE_URL="postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
* npx tsx src/scripts/crawl-single-store.ts <dispensaryId>
*
* Example:
* DATABASE_URL="..." npx tsx src/scripts/crawl-single-store.ts 112
*/
import { Pool } from 'pg';
import dotenv from 'dotenv';
import {
executeGraphQL,
startSession,
endSession,
getFingerprint,
GRAPHQL_HASHES,
DUTCHIE_CONFIG,
} from '../platforms/dutchie';
dotenv.config();
// ============================================================
// DATABASE CONNECTION
// ============================================================
/**
 * Build the PostgreSQL connection string for this script.
 *
 * Precedence: DATABASE_URL, then CANNAIQ_DB_URL, then a URL assembled from
 * the individual CANNAIQ_DB_* variables (with local-development defaults).
 * User, password and database name are percent-encoded so characters such as
 * '@', ':' or '/' in a credential cannot corrupt the URL.
 */
function getConnectionString(): string {
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }
  const host = process.env.CANNAIQ_DB_HOST || 'localhost';
  const port = process.env.CANNAIQ_DB_PORT || '54320';
  const name = encodeURIComponent(process.env.CANNAIQ_DB_NAME || 'dutchie_menus');
  const user = encodeURIComponent(process.env.CANNAIQ_DB_USER || 'dutchie');
  const pass = encodeURIComponent(process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass');
  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
}
const pool = new Pool({ connectionString: getConnectionString() });
// ============================================================
// MAIN
// ============================================================
/**
 * Crawl the first product page of one dispensary and print every step.
 *
 * Reads the dispensary id from argv[2], loads the dispensary row, starts a
 * stealth session, runs the FilteredProducts GraphQL query once and prints a
 * summary. Nothing is written to the database.
 *
 * The process exits with status 1 on a usage error or when any step throws.
 * The session and the database pool are always released before exiting.
 */
async function main(): Promise<void> {
  const dispensaryId = parseInt(process.argv[2], 10);
  if (!dispensaryId) {
    console.error('Usage: npx tsx src/scripts/crawl-single-store.ts <dispensaryId>');
    console.error('Example: npx tsx src/scripts/crawl-single-store.ts 112');
    process.exit(1);
  }

  // Prints the boxed header that introduces each step.
  const printStep = (title: string): void => {
    console.log('┌─────────────────────────────────────────────────────────────┐');
    console.log(`│ ${title} │`);
    console.log('└─────────────────────────────────────────────────────────────┘');
  };

  console.log('');
  console.log('╔════════════════════════════════════════════════════════════╗');
  console.log('║ SINGLE STORE CRAWL - VERBOSE OUTPUT ║');
  console.log('╚════════════════════════════════════════════════════════════╝');
  console.log('');

  let failed = false;
  try {
    // ============================================================
    // STEP 1: Get dispensary info from database
    // ============================================================
    printStep('STEP 1: Load Dispensary Info from Database');
    const dispResult = await pool.query(`
SELECT
id,
name,
platform_dispensary_id,
menu_url,
menu_type,
city,
state
FROM dispensaries
WHERE id = $1
`, [dispensaryId]);
    if (dispResult.rows.length === 0) {
      throw new Error(`Dispensary ${dispensaryId} not found`);
    }
    const disp = dispResult.rows[0];
    console.log(` Dispensary ID: ${disp.id}`);
    console.log(` Name: ${disp.name}`);
    console.log(` City, State: ${disp.city}, ${disp.state}`);
    console.log(` Menu Type: ${disp.menu_type}`);
    console.log(` Platform ID: ${disp.platform_dispensary_id}`);
    console.log(` Menu URL: ${disp.menu_url}`);
    if (!disp.platform_dispensary_id) {
      throw new Error('Dispensary does not have a platform_dispensary_id - cannot crawl');
    }

    // Extract cName from menu_url
    const cNameMatch = disp.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';
    console.log(` cName (derived): ${cName}`);
    console.log('');

    // ============================================================
    // STEP 2: Start stealth session
    // ============================================================
    printStep('STEP 2: Start Stealth Session');
    // NOTE(review): the timezone is fixed to Arizona even when the store's
    // state differs — confirm this is intended for non-AZ stores.
    const session = startSession(disp.state || 'AZ', 'America/Phoenix');
    const fp = getFingerprint();
    console.log(` Session ID: ${session.sessionId}`);
    console.log(` User-Agent: ${fp.userAgent.slice(0, 60)}...`);
    console.log(` Accept-Language: ${fp.acceptLanguage}`);
    console.log(` Sec-CH-UA: ${fp.secChUa || '(not set)'}`);
    console.log('');

    // ============================================================
    // STEP 3: Execute GraphQL query
    // ============================================================
    printStep('STEP 3: Execute GraphQL Query (FilteredProducts)');
    const variables = {
      includeEnterpriseSpecials: false,
      productsFilter: {
        dispensaryId: disp.platform_dispensary_id,
        pricingType: 'rec',
        Status: 'Active',
        types: [],
        useCache: true,
        isDefaultSort: true,
        sortBy: 'popularSortIdx',
        sortDirection: 1,
        bypassOnlineThresholds: true,
        isKioskMenu: false,
        removeProductsBelowOptionThresholds: false,
      },
      page: 0,
      perPage: 100,
    };
    console.log(` Endpoint: ${DUTCHIE_CONFIG.graphqlEndpoint}`);
    console.log(` Operation: FilteredProducts`);
    console.log(` Hash: ${GRAPHQL_HASHES.FilteredProducts.slice(0, 20)}...`);
    console.log(` dispensaryId: ${variables.productsFilter.dispensaryId}`);
    console.log(` pricingType: ${variables.productsFilter.pricingType}`);
    console.log(` Status: ${variables.productsFilter.Status}`);
    console.log(` perPage: ${variables.perPage}`);
    console.log('');
    console.log(' Sending request...');
    const startTime = Date.now();
    const result = await executeGraphQL(
      'FilteredProducts',
      variables,
      GRAPHQL_HASHES.FilteredProducts,
      { cName, maxRetries: 3 }
    );
    const elapsed = Date.now() - startTime;
    console.log(` Response time: ${elapsed}ms`);
    console.log('');

    // ============================================================
    // STEP 4: Process response
    // ============================================================
    printStep('STEP 4: Process Response');
    const data = result?.data?.filteredProducts;
    if (!data) {
      console.log(' ERROR: No data returned from GraphQL');
      console.log(' Raw result:', JSON.stringify(result, null, 2).slice(0, 500));
      endSession();
      return;
    }
    const products = data.products || [];
    const totalCount = data.queryInfo?.totalCount || 0;
    // Derive the page count from the page size actually requested.
    const totalPages = Math.ceil(totalCount / variables.perPage);
    console.log(` Total products: ${totalCount}`);
    console.log(` Products in page: ${products.length}`);
    console.log(` Total pages: ${totalPages}`);
    console.log('');

    // Show first few products
    console.log(' First 5 products:');
    console.log(' ─────────────────────────────────────────────────────────');
    for (let i = 0; i < Math.min(5, products.length); i++) {
      const p = products[i];
      const name = (p.name || 'Unknown').slice(0, 40);
      const brand = (p.brand?.name || 'Unknown').slice(0, 15);
      const price = p.Prices?.[0]?.price || p.medPrice || p.recPrice || 'N/A';
      console.log(` ${i + 1}. ${name.padEnd(42)} | ${brand.padEnd(17)} | $${price}`);
    }
    console.log('');

    // ============================================================
    // STEP 5: End session
    // ============================================================
    printStep('STEP 5: End Session');
    endSession();
    console.log('');

    // ============================================================
    // SUMMARY
    // ============================================================
    console.log('╔════════════════════════════════════════════════════════════╗');
    console.log('║ SUMMARY ║');
    console.log('╠════════════════════════════════════════════════════════════╣');
    console.log(`║ Store: ${disp.name.slice(0, 38).padEnd(38)}`);
    console.log(`║ Products Found: ${String(totalCount).padEnd(38)}`);
    console.log(`║ Response Time: ${(elapsed + 'ms').padEnd(38)}`);
    console.log(`║ Status: ${'SUCCESS'.padEnd(38)}`);
    console.log('╚════════════════════════════════════════════════════════════╝');
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    const stack = error instanceof Error ? error.stack : undefined;
    console.error('');
    console.error('╔════════════════════════════════════════════════════════════╗');
    console.error('║ ERROR ║');
    console.error('╚════════════════════════════════════════════════════════════╝');
    console.error(` ${message}`);
    if (stack) {
      console.error('');
      console.error('Stack trace:');
      console.error(stack.split('\n').slice(0, 5).join('\n'));
    }
    failed = true;
  } finally {
    // endSession() is a no-op when the session was already closed above.
    endSession();
    await pool.end();
  }

  // Exit only after cleanup: calling process.exit() inside the catch block
  // would terminate before the finally block could close the pool.
  if (failed) {
    process.exit(1);
  }
}
main();

View File

@@ -23,6 +23,7 @@ import {
DutchieNormalizer,
hydrateToCanonical,
} from '../hydration';
import { initializeImageStorage } from '../utils/image-storage';
dotenv.config();
@@ -137,6 +138,11 @@ async function main() {
console.log(`Test Crawl to Canonical - Dispensary ${dispensaryId}`);
console.log('============================================================\n');
// Initialize image storage
console.log('[Init] Initializing image storage...');
await initializeImageStorage();
console.log(' Image storage ready\n');
try {
// Step 1: Get dispensary info
console.log('[Step 1] Getting dispensary info...');

View File

@@ -0,0 +1,268 @@
#!/usr/bin/env npx tsx
/**
* Test Image Download - Tests image downloading with a small batch of products
*
* Usage:
* DATABASE_URL="postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
* STORAGE_DRIVER=local STORAGE_BASE_PATH=./storage \
* npx tsx src/scripts/test-image-download.ts <dispensaryId> [limit]
*
* Example:
* DATABASE_URL="..." npx tsx src/scripts/test-image-download.ts 112 5
*/
import { Pool } from 'pg';
import dotenv from 'dotenv';
import {
executeGraphQL,
startSession,
endSession,
GRAPHQL_HASHES,
} from '../platforms/dutchie';
import { DutchieNormalizer } from '../hydration/normalizers/dutchie';
import { hydrateToCanonical } from '../hydration/canonical-upsert';
import { initializeImageStorage, getStorageStats } from '../utils/image-storage';
dotenv.config();
// ============================================================
// DATABASE CONNECTION
// ============================================================
/**
 * Resolve the PostgreSQL connection string used by this script.
 *
 * `DATABASE_URL` wins when it is set and is returned verbatim. Otherwise the
 * URL is assembled from the `CANNAIQ_DB_*` variables, each falling back to the
 * local development default.
 *
 * The user and password are percent-encoded so that reserved characters
 * (`@`, `:`, `/`, `#`, ...) in a credential cannot corrupt the URL structure.
 * For the default credentials the result is unchanged.
 *
 * @returns A `postgresql://user:pass@host:port/name` connection string.
 */
function getConnectionString(): string {
  const explicitUrl = process.env.DATABASE_URL;
  if (explicitUrl) {
    return explicitUrl;
  }

  const host = process.env.CANNAIQ_DB_HOST || 'localhost';
  const port = process.env.CANNAIQ_DB_PORT || '54320';
  const name = process.env.CANNAIQ_DB_NAME || 'dutchie_menus';
  // Credentials are the only parts that routinely contain URL-reserved characters.
  const user = encodeURIComponent(process.env.CANNAIQ_DB_USER || 'dutchie');
  const pass = encodeURIComponent(process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass');

  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
}
const pool = new Pool({ connectionString: getConnectionString() });
// ============================================================
// MAIN
// ============================================================
/**
 * End-to-end smoke test for the product image download pipeline.
 *
 * Steps: initialise image storage, load the dispensary row, delete its
 * store_products so every crawled product goes through the "new product"
 * flow, fetch one limited page from Dutchie, normalize + hydrate with image
 * downloads enabled, then print storage stats and the stored
 * `local_image_path` values.
 *
 * CLI arguments: `<dispensaryId> [limit]` (limit defaults to 5).
 *
 * WARNING: step 3 deletes ALL store_products rows of the given dispensary.
 *
 * On failure the error is printed and the process exits with code 1, but only
 * after the pool has been closed.
 */
async function main() {
  const dispensaryId = parseInt(process.argv[2], 10);
  const limit = parseInt(process.argv[3], 10) || 5;
  if (!dispensaryId) {
    console.error('Usage: npx tsx src/scripts/test-image-download.ts <dispensaryId> [limit]');
    console.error('Example: npx tsx src/scripts/test-image-download.ts 112 5');
    process.exit(1);
  }

  console.log('');
  console.log('╔════════════════════════════════════════════════════════════╗');
  console.log('║ IMAGE DOWNLOAD TEST ║');
  console.log('╚════════════════════════════════════════════════════════════╝');
  console.log('');

  // Exit is deferred until after `finally` so the pool is always closed first.
  let failed = false;

  try {
    // Initialize image storage
    console.log('┌─────────────────────────────────────────────────────────────┐');
    console.log('│ STEP 1: Initialize Image Storage │');
    console.log('└─────────────────────────────────────────────────────────────┘');
    await initializeImageStorage();
    const statsBefore = await getStorageStats();
    console.log(` Base path: ${statsBefore.basePath}`);
    console.log(` Products before: ${statsBefore.productCount}`);
    console.log(` Brands before: ${statsBefore.brandCount}`);
    console.log('');

    // Get dispensary info
    console.log('┌─────────────────────────────────────────────────────────────┐');
    console.log('│ STEP 2: Load Dispensary Info │');
    console.log('└─────────────────────────────────────────────────────────────┘');
    const dispResult = await pool.query(`
      SELECT
        id, name, platform_dispensary_id, menu_url, state, slug
      FROM dispensaries
      WHERE id = $1
    `, [dispensaryId]);
    if (dispResult.rows.length === 0) {
      throw new Error(`Dispensary ${dispensaryId} not found`);
    }
    const disp = dispResult.rows[0];
    console.log(` Dispensary: ${disp.name}`);
    console.log(` State: ${disp.state}`);
    console.log(` Slug: ${disp.slug}`);
    console.log(` Platform ID: ${disp.platform_dispensary_id}`);
    console.log('');

    // Delete the dispensary's existing store_products to force "new" products
    console.log('┌─────────────────────────────────────────────────────────────┐');
    console.log('│ STEP 3: Clear Store Products (to test new product flow) │');
    console.log('└─────────────────────────────────────────────────────────────┘');
    const deleteResult = await pool.query(`
      DELETE FROM store_products
      WHERE dispensary_id = $1
      RETURNING id
    `, [dispensaryId]);
    console.log(` Deleted ${deleteResult.rowCount} existing store_products`);
    console.log('');

    // Fetch products from Dutchie
    console.log('┌─────────────────────────────────────────────────────────────┐');
    console.log('│ STEP 4: Fetch Products from Dutchie (limited) │');
    console.log('└─────────────────────────────────────────────────────────────┘');
    // The cName is the path segment after /embedded-menu/ or /dispensary/ in the menu URL
    const cNameMatch = disp.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';
    const session = startSession(disp.state || 'AZ', 'America/Phoenix');
    console.log(` Session ID: ${session.sessionId}`);
    console.log(` cName: ${cName}`);
    console.log(` Limit: ${limit} products`);
    const variables = {
      includeEnterpriseSpecials: false,
      productsFilter: {
        dispensaryId: disp.platform_dispensary_id,
        pricingType: 'rec',
        Status: 'Active',
        types: [],
        useCache: true,
        isDefaultSort: true,
        sortBy: 'popularSortIdx',
        sortDirection: 1,
        bypassOnlineThresholds: true,
        isKioskMenu: false,
        removeProductsBelowOptionThresholds: false,
      },
      page: 0,
      perPage: limit, // Only fetch limited products
    };
    const startTime = Date.now();
    let result;
    let elapsed = 0;
    try {
      result = await executeGraphQL(
        'FilteredProducts',
        variables,
        GRAPHQL_HASHES.FilteredProducts,
        { cName, maxRetries: 3 }
      );
      elapsed = Date.now() - startTime;
    } finally {
      // Close the session even when the request throws
      endSession();
    }
    const products = result?.data?.filteredProducts?.products || [];
    console.log(` Fetched: ${products.length} products in ${elapsed}ms`);

    // Show products with images
    console.log('');
    console.log(' Products with images:');
    for (let i = 0; i < products.length; i++) {
      const p = products[i];
      const hasImage = !!p.Image;
      const brandName = p.brand?.name || 'Unknown';
      // A missing name renders as blank padding instead of the literal "undefined"
      const displayName = (p.name ?? '').slice(0, 40).padEnd(42);
      console.log(` ${i + 1}. ${displayName} | ${brandName.slice(0, 15).padEnd(17)} | ${hasImage ? '✓ has image' : '✗ no image'}`);
    }
    console.log('');

    // Normalize and hydrate
    console.log('┌─────────────────────────────────────────────────────────────┐');
    console.log('│ STEP 5: Normalize and Hydrate (with image download) │');
    console.log('└─────────────────────────────────────────────────────────────┘');
    const normalizer = new DutchieNormalizer();
    // Wrap products in expected payload format
    const payload = {
      raw_json: products, // DutchieNormalizer.extractProducts handles arrays
      dispensary_id: dispensaryId,
    };
    const normResult = normalizer.normalize(payload);
    console.log(` Normalized products: ${normResult.products.length}`);
    console.log(` Brands found: ${normResult.brands.length}`);
    const hydrateStart = Date.now();
    const hydrateResult = await hydrateToCanonical(
      pool,
      dispensaryId,
      normResult,
      null, // no crawl run ID for test
      { dryRun: false, downloadImages: true }
    );
    const hydrateElapsed = Date.now() - hydrateStart;
    console.log('');
    console.log(` Hydration time: ${hydrateElapsed}ms`);
    console.log(` Products new: ${hydrateResult.productsNew}`);
    console.log(` Products updated: ${hydrateResult.productsUpdated}`);
    console.log(` Images downloaded: ${hydrateResult.imagesDownloaded}`);
    console.log(` Images skipped: ${hydrateResult.imagesSkipped}`);
    console.log(` Images failed: ${hydrateResult.imagesFailed}`);
    console.log(` Image bytes: ${(hydrateResult.imagesBytesTotal / 1024).toFixed(1)} KB`);
    console.log('');

    // Check storage stats
    console.log('┌─────────────────────────────────────────────────────────────┐');
    console.log('│ STEP 6: Verify Storage │');
    console.log('└─────────────────────────────────────────────────────────────┘');
    const statsAfter = await getStorageStats();
    console.log(` Products after: ${statsAfter.productCount}`);
    console.log(` Brands after: ${statsAfter.brandCount}`);
    console.log(` Total size: ${(statsAfter.totalSizeBytes / 1024).toFixed(1)} KB`);
    console.log('');

    // Check database for local_image_path
    console.log('┌─────────────────────────────────────────────────────────────┐');
    console.log('│ STEP 7: Check Database for Local Image Paths │');
    console.log('└─────────────────────────────────────────────────────────────┘');
    const dbCheck = await pool.query(`
      SELECT
        id, name_raw, local_image_path, images
      FROM store_products
      WHERE dispensary_id = $1
      LIMIT 10
    `, [dispensaryId]);
    for (const row of dbCheck.rows) {
      const hasLocal = !!row.local_image_path;
      const hasImages = !!row.images;
      const rowName = (row.name_raw ?? '').slice(0, 40).padEnd(42);
      console.log(` ${row.id}: ${rowName} | local: ${hasLocal ? '✓' : '✗'} | images: ${hasImages ? '✓' : '✗'}`);
      if (row.local_image_path) {
        console.log(`${row.local_image_path}`);
      }
    }
    console.log('');

    // Summary
    console.log('╔════════════════════════════════════════════════════════════╗');
    console.log('║ SUMMARY ║');
    console.log('╠════════════════════════════════════════════════════════════╣');
    console.log(`║ Dispensary: ${String(disp.name ?? '').slice(0, 37).padEnd(37)}`);
    console.log(`║ Products crawled: ${String(products.length).padEnd(37)}`);
    console.log(`║ Images downloaded: ${String(hydrateResult.imagesDownloaded).padEnd(37)}`);
    console.log(`║ Total image bytes: ${((hydrateResult.imagesBytesTotal / 1024).toFixed(1) + ' KB').padEnd(37)}`);
    console.log(`║ Status: ${'SUCCESS'.padEnd(37)}`);
    console.log('╚════════════════════════════════════════════════════════════╝');
  } catch (error: unknown) {
    const err = error instanceof Error ? error : new Error(String(error));
    console.error('');
    console.error('╔════════════════════════════════════════════════════════════╗');
    console.error('║ ERROR ║');
    console.error('╚════════════════════════════════════════════════════════════╝');
    console.error(` ${err.message}`);
    if (err.stack) {
      console.error('');
      console.error('Stack trace:');
      console.error(err.stack.split('\n').slice(0, 5).join('\n'));
    }
    failed = true;
  } finally {
    await pool.end();
  }

  // Calling process.exit() inside the catch would have skipped the finally block above.
  if (failed) {
    process.exit(1);
  }
}
main();

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env npx tsx
/**
* Test Image Proxy - Standalone test without backend
*
* Usage:
* npx tsx src/scripts/test-image-proxy.ts
*/
import express from 'express';
import imageProxyRoutes from '../routes/image-proxy';
const PORT = 3099;
const app = express();

// Expose the image proxy router under /img, exactly as the real backend mounts it
app.use('/img', imageProxyRoutes);

// Once the server is listening, request each sample URL and print what came back.
// Every HTTP status is accepted (validateStatus always true) so 404s are reported, not thrown.
app.listen(PORT, async () => {
  console.log(`Test image proxy running on http://localhost:${PORT}`);
  console.log('');
  console.log('Testing image proxy...');
  console.log('');

  const axios = require('axios');

  // One stored image exercised with different transform query strings
  const sampleImage =
    '/img/products/az/az-deeply-rooted/clout-king/68b4b20a0f9ef3e90eb51e96/image-268a6e44.webp';
  const cases = [
    { name: 'Original image', url: sampleImage },
    { name: 'Resize to 200px width', url: `${sampleImage}?w=200` },
    { name: 'Resize to 100x100 cover', url: `${sampleImage}?w=100&h=100&fit=cover` },
    { name: 'Grayscale + blur', url: `${sampleImage}?w=200&gray=1&blur=2` },
    { name: 'Convert to JPEG', url: `${sampleImage}?w=200&format=jpeg&q=70` },
    { name: 'Non-existent image', url: '/img/products/az/nonexistent/image.webp' },
  ];

  for (const { name, url } of cases) {
    try {
      const response = await axios.get(`http://localhost:${PORT}${url}`, {
        responseType: 'arraybuffer',
        validateStatus: () => true,
      });
      const contentType = response.headers['content-type'];
      const sizeBytes = response.data.length;
      const httpStatus = response.status;
      // Long URLs are cut at 80 characters and marked with an ellipsis
      const shownUrl = url.length > 80 ? `${url.slice(0, 80)}...` : url;

      console.log(`${name}:`);
      console.log(` URL: ${shownUrl}`);
      console.log(` Status: ${httpStatus}`);
      console.log(` Content-Type: ${contentType}`);
      console.log(` Size: ${(sizeBytes / 1024).toFixed(1)} KB`);
      console.log('');
    } catch (error: any) {
      console.log(`${name}: ERROR - ${error.message}`);
      console.log('');
    }
  }

  console.log('Tests complete!');
  process.exit(0);
});

View File

@@ -0,0 +1,117 @@
/**
* Test script for stealth session management
*
* Tests:
* 1. Per-session fingerprint rotation
* 2. Geographic consistency (timezone → Accept-Language)
* 3. Proxy location loading from database
*
* Usage:
* npx tsx src/scripts/test-stealth-session.ts
*/
import {
startSession,
endSession,
getCurrentSession,
getFingerprint,
getRandomFingerprint,
getLocaleForTimezone,
buildHeaders,
} from '../platforms/dutchie';
const RULE = '='.repeat(60);

console.log(RULE);
console.log('STEALTH SESSION TEST');
console.log(RULE);

// ---- Test 1: every sample timezone (including undefined and a bogus one) maps to a locale
console.log('\n[Test 1] Timezone to Locale Mapping:');
const timezoneSamples = [
  'America/Phoenix',
  'America/Los_Angeles',
  'America/New_York',
  'America/Chicago',
  undefined,
  'Invalid/Timezone',
];
timezoneSamples.forEach((tz) => {
  const locale = getLocaleForTimezone(tz);
  console.log(` ${tz || '(undefined)'}${locale}`);
});

// ---- Test 2: draw five random fingerprints and show the start of each user agent
console.log('\n[Test 2] Random Fingerprint Selection (5 samples):');
let sampleNo = 0;
while (sampleNo < 5) {
  sampleNo += 1;
  const { userAgent } = getRandomFingerprint();
  console.log(` ${sampleNo}. ${userAgent.slice(0, 60)}...`);
}

// ---- Test 3: fingerprint/session state before, during and after a session
console.log('\n[Test 3] Session Management:');

// No session yet: print whatever getFingerprint() and getCurrentSession() report
console.log(' Before session:');
const idleFingerprint = getFingerprint();
console.log(` getFingerprint(): ${idleFingerprint.userAgent.slice(0, 50)}...`);
console.log(` getCurrentSession(): ${getCurrentSession()}`);

// Open a session for Arizona
console.log('\n Starting session (AZ, America/Phoenix):');
const azSession = startSession('AZ', 'America/Phoenix');
console.log(` Session ID: ${azSession.sessionId}`);
console.log(` Fingerprint UA: ${azSession.fingerprint.userAgent.slice(0, 50)}...`);
console.log(` Accept-Language: ${azSession.fingerprint.acceptLanguage}`);
console.log(` Timezone: ${azSession.timezone}`);

// While the session is open, getFingerprint() is compared against the session's own fingerprint
console.log('\n During session:');
const activeFingerprint = getFingerprint();
const sameAsSession = activeFingerprint.userAgent === azSession.fingerprint.userAgent;
console.log(` getFingerprint(): ${activeFingerprint.userAgent.slice(0, 50)}...`);
console.log(` Same as session? ${sameAsSession}`);

// Headers built while the session is open
console.log('\n buildHeaders() during session:');
const builtHeaders = buildHeaders('/embedded-menu/test-store');
console.log(` User-Agent: ${builtHeaders['user-agent'].slice(0, 50)}...`);
console.log(` Accept-Language: ${builtHeaders['accept-language']}`);
console.log(` Origin: ${builtHeaders['origin']}`);
console.log(` Referer: ${builtHeaders['referer']}`);

// Close the session and show the current-session state again
console.log('\n Ending session:');
endSession();
console.log(` getCurrentSession(): ${getCurrentSession()}`);

// ---- Test 4: ten consecutive sessions, counting distinct user agents
console.log('\n[Test 4] Multiple Sessions (fingerprint variety):');
const observedAgents: string[] = Array.from({ length: 10 }, () => {
  // Read the user agent before ending the session, as each session is short-lived
  const { userAgent } = startSession('CA', 'America/Los_Angeles').fingerprint;
  endSession();
  return userAgent;
});
const distinctAgents = new Set(observedAgents).size;
console.log(` 10 sessions created, ${distinctAgents} unique fingerprints`);
console.log(` Variety: ${distinctAgents >= 3 ? '✅ Good' : '⚠️ Low - may need more fingerprint options'}`);

// ---- Test 5: each US state/timezone pair should produce an en-US Accept-Language
console.log('\n[Test 5] Geographic Consistency:');
const geoCases = [
  { state: 'AZ', tz: 'America/Phoenix' },
  { state: 'CA', tz: 'America/Los_Angeles' },
  { state: 'NY', tz: 'America/New_York' },
  { state: 'IL', tz: 'America/Chicago' },
];
geoCases.forEach(({ state, tz }) => {
  const geoSession = startSession(state, tz);
  const acceptLanguage = geoSession.fingerprint.acceptLanguage;
  const verdict = acceptLanguage.includes('en-US') ? '✅' : '❌';
  console.log(` ${state} (${tz}): Accept-Language=${geoSession.fingerprint.acceptLanguage} ${verdict}`);
  endSession();
});

console.log('\n' + RULE);
console.log('TEST COMPLETE');
console.log(RULE);

View File

@@ -0,0 +1,144 @@
/**
* Test script for stealth session with REAL proxy data from database
*
* Tests:
* 1. Load proxies from database (with location data)
* 2. Verify location fields (city, state, timezone) are loaded
* 3. Start session with proxy's timezone
* 4. Verify Accept-Language matches timezone
*
* Usage:
* DATABASE_URL="postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" npx tsx src/scripts/test-stealth-with-db.ts
*/
import { Pool } from 'pg';
import {
CrawlRotator,
ProxyRotator,
} from '../services/crawl-rotator';
import {
startSession,
endSession,
getLocaleForTimezone,
} from '../platforms/dutchie';
const DATABASE_URL = process.env.DATABASE_URL;
if (!DATABASE_URL) {
console.error('ERROR: DATABASE_URL environment variable is required');
process.exit(1);
}
/**
 * Exercise stealth sessions against the real proxy rows in the database.
 *
 * 1. Load proxies through CrawlRotator and print the rotator stats.
 * 2. Print the location fields (city/state/country/timezone) of each proxy.
 * 3. Start a session with the first proxy that has a timezone and compare the
 *    session's Accept-Language with getLocaleForTimezone().
 * 4. Start one session per distinct proxy timezone and print its Accept-Language.
 *
 * The pool is always closed. If anything throws, the error is logged and the
 * process exit code is set to 1 so callers can detect the failure.
 */
async function main() {
  console.log('='.repeat(60));
  console.log('STEALTH SESSION TEST WITH DATABASE');
  console.log('='.repeat(60));
  const pool = new Pool({ connectionString: DATABASE_URL });
  try {
    // Test 1: Load proxies with location data
    console.log('\n[Test 1] Loading proxies from database...');
    const rotator = new CrawlRotator(pool);
    await rotator.initialize();
    const stats = rotator.proxy.getStats();
    console.log(` Total proxies: ${stats.totalProxies}`);
    console.log(` Active proxies: ${stats.activeProxies}`);
    if (stats.activeProxies === 0) {
      console.log('\n WARNING: No active proxies in database!');
      console.log(' Insert test proxies with:');
      console.log(` INSERT INTO proxies (host, port, protocol, city, state, country_code, timezone, active)`);
      console.log(` VALUES ('proxy1.example.com', 8080, 'http', 'Phoenix', 'AZ', 'US', 'America/Phoenix', true);`);
      return;
    }

    // Test 2: Check location data on proxies
    console.log('\n[Test 2] Checking proxy location data...');
    let proxyCount = 0;
    let withLocationCount = 0;
    // Pull as many proxies from the rotator as it reports in total
    for (let i = 0; i < stats.totalProxies; i++) {
      const proxy = rotator.proxy.getNext();
      if (!proxy) break;
      proxyCount++;
      const hasLocation = !!(proxy.stateCode || proxy.timezone);
      if (hasLocation) withLocationCount++;
      console.log(` Proxy ${proxy.id}: ${proxy.host}:${proxy.port}`);
      console.log(` City: ${proxy.city || '(not set)'}`);
      console.log(` State: ${proxy.stateCode || '(not set)'}`);
      console.log(` Country: ${proxy.countryCode || '(not set)'}`);
      console.log(` Timezone: ${proxy.timezone || '(not set)'}`);
      console.log(` Has location data: ${hasLocation ? '✅' : '❌'}`);
    }
    console.log(`\n Summary: ${withLocationCount}/${proxyCount} proxies have location data`);

    // Test 3: Start session using proxy's timezone
    console.log('\n[Test 3] Starting session with proxy timezone...');
    // Keep pulling proxies (at most one full pass) until one has a timezone. Taking
    // only the next proxy would report "no timezone data" whenever that particular
    // proxy happened to lack one.
    let firstProxy = rotator.proxy.getNext();
    for (let i = 1; i < stats.totalProxies && firstProxy && !firstProxy.timezone; i++) {
      firstProxy = rotator.proxy.getNext();
    }
    if (firstProxy && firstProxy.timezone) {
      console.log(` Using proxy: ${firstProxy.host} (${firstProxy.city}, ${firstProxy.stateCode})`);
      console.log(` Proxy timezone: ${firstProxy.timezone}`);
      const session = startSession(firstProxy.stateCode, firstProxy.timezone);
      console.log(` Session ID: ${session.sessionId}`);
      console.log(` Session timezone: ${session.timezone}`);
      console.log(` Session Accept-Language: ${session.fingerprint.acceptLanguage}`);
      // Verify Accept-Language matches expected locale for timezone
      const expectedLocale = getLocaleForTimezone(firstProxy.timezone);
      const matches = session.fingerprint.acceptLanguage === expectedLocale;
      console.log(` Expected locale: ${expectedLocale}`);
      console.log(` Locale matches: ${matches ? '✅' : '❌'}`);
      endSession();
    } else {
      console.log(' WARNING: No proxy with timezone data found');
    }

    // Test 4: Test each timezone in database
    console.log('\n[Test 4] Testing all proxy timezones...');
    const seenTimezones = new Set<string>();
    // One more pass over the rotator; each distinct timezone is tested once
    for (let i = 0; i < stats.totalProxies; i++) {
      const proxy = rotator.proxy.getNext();
      if (!proxy || !proxy.timezone) continue;
      if (seenTimezones.has(proxy.timezone)) continue;
      seenTimezones.add(proxy.timezone);
      const session = startSession(proxy.stateCode, proxy.timezone);
      console.log(` ${proxy.timezone}:`);
      console.log(` State: ${proxy.stateCode || 'unknown'}`);
      console.log(` Accept-Language: ${session.fingerprint.acceptLanguage}`);
      endSession();
    }

    console.log('\n' + '='.repeat(60));
    console.log('TEST COMPLETE');
    console.log('='.repeat(60));
    if (withLocationCount === 0) {
      console.log('\n⚠ No proxies have location data.');
      console.log(' Geographic consistency will use default locale (en-US).');
      console.log(' To enable geo-consistency, populate city/state/timezone on proxies.');
    } else {
      console.log('\n✅ Stealth session with geo-consistency is working!');
      console.log(' Sessions will use Accept-Language matching proxy timezone.');
    }
  } catch (error) {
    console.error('Error:', error);
    // Surface the failure to the caller; previously the script still exited with code 0
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
main();

View File

@@ -1,26 +1,29 @@
/**
* Local Image Storage Utility
*
* Downloads and stores product images to local filesystem.
* Replaces MinIO-based storage with simple local file storage.
* Downloads and stores product images to local filesystem with proper hierarchy.
*
* Directory structure:
* /images/products/<dispensary_id>/<product_id>.webp
* /images/products/<dispensary_id>/<product_id>-thumb.webp
* /images/products/<dispensary_id>/<product_id>-medium.webp
* /images/brands/<brand_slug>.webp
* /images/products/<state>/<store_slug>/<brand_slug>/<product_id>/image.webp
* /images/products/<state>/<store_slug>/<brand_slug>/<product_id>/image-medium.webp
* /images/products/<state>/<store_slug>/<brand_slug>/<product_id>/image-thumb.webp
* /images/brands/<brand_slug>/logo.webp
*
* This structure allows:
* - Easy migration to MinIO/S3 (bucket per state)
* - Browsing by state/store/brand
* - Multiple images per product (future: gallery)
*/
import axios from 'axios';
import sharp from 'sharp';
// @ts-ignore - sharp module typing quirk
const sharp = require('sharp');
import * as fs from 'fs/promises';
import * as path from 'path';
import { createHash } from 'crypto';
// Base path for image storage - configurable via env
// Uses project-relative paths by default, NOT /app or other privileged paths
function getImagesBasePath(): string {
// Priority: IMAGES_PATH > STORAGE_BASE_PATH/images > ./storage/images
if (process.env.IMAGES_PATH) {
return process.env.IMAGES_PATH;
}
@@ -35,16 +38,28 @@ const IMAGES_BASE_PATH = getImagesBasePath();
const IMAGES_PUBLIC_URL = process.env.IMAGES_PUBLIC_URL || '/images';
export interface LocalImageSizes {
full: string; // URL path: /images/products/123/456.webp
medium: string; // URL path: /images/products/123/456-medium.webp
thumb: string; // URL path: /images/products/123/456-thumb.webp
original: string; // URL path to original image
// Legacy compatibility - all point to original until we add image proxy
full: string;
medium: string;
thumb: string;
}
/**
 * Outcome of an image/logo download attempt, as returned by the download
 * functions in this module.
 *
 * Failures are reported through `success: false` plus `error` (for example
 * 'No image URL provided'). A successful result carries `urls`, and
 * `skipped: true` marks the case where the file was already on disk.
 */
export interface DownloadResult {
// False when the download could not be performed; see `error` for the reason.
success: boolean;
// Public URL paths for the stored image (built from the file path via pathToUrl).
urls?: LocalImageSizes;
// Filesystem paths of the stored image, parallel to `urls`.
localPaths?: LocalImageSizes;
// Human-readable failure reason; only meaningful when `success` is false.
error?: string;
// Size in bytes of what was written for a fresh download; not set when the download was skipped.
bytesDownloaded?: number;
skipped?: boolean; // True if image already exists
}
/**
 * Identifies where a product image lives in the storage hierarchy
 * products/<state>/<store>/<brand>/<product>/.
 *
 * buildProductImagePath() slugifies stateCode, storeSlug, brandSlug and
 * productId (an empty value becomes 'unknown') to form the directory path.
 */
export interface ProductImageContext {
stateCode: string; // e.g., "AZ", "CA"
storeSlug: string; // e.g., "deeply-rooted"
brandSlug: string; // e.g., "high-west-farms"
productId: string; // External product ID
// Not used when building the image path; populated by downloadProductImageLegacy().
dispensaryId?: number; // For backwards compat
}
/**
@@ -58,6 +73,17 @@ async function ensureDir(dirPath: string): Promise<void> {
}
}
/**
 * Turn arbitrary text into a lowercase, hyphen-separated path segment.
 *
 * Every run of characters outside [a-z0-9] becomes a single '-', leading and
 * trailing hyphens are stripped, and the result is capped at 50 characters.
 * The cap is applied after trimming, so a capped slug may still end in '-'.
 * An empty result is replaced by 'unknown'.
 */
function slugify(str: string): string {
  const lowered = str.toLowerCase();
  const hyphenated = lowered.replace(/[^a-z0-9]+/g, '-');
  const trimmed = hyphenated.replace(/^-+|-+$/g, '');
  const capped = trimmed.slice(0, 50);
  return capped === '' ? 'unknown' : capped;
}
/**
* Generate a short hash from a URL for deduplication
*/
@@ -81,53 +107,30 @@ async function downloadImage(imageUrl: string): Promise<Buffer> {
}
/**
* Process and save image in multiple sizes
* Returns the file paths relative to IMAGES_BASE_PATH
* Process and save original image (convert to webp for consistency)
*
* We store only the original - resizing will be done on-demand via
* an image proxy service (imgproxy, thumbor, or similar) in the future.
*/
async function processAndSaveImage(
buffer: Buffer,
outputDir: string,
baseFilename: string
): Promise<{ full: string; medium: string; thumb: string; totalBytes: number }> {
): Promise<{ original: string; totalBytes: number }> {
await ensureDir(outputDir);
const fullPath = path.join(outputDir, `${baseFilename}.webp`);
const mediumPath = path.join(outputDir, `${baseFilename}-medium.webp`);
const thumbPath = path.join(outputDir, `${baseFilename}-thumb.webp`);
const originalPath = path.join(outputDir, `${baseFilename}.webp`);
// Process images in parallel
const [fullBuffer, mediumBuffer, thumbBuffer] = await Promise.all([
// Full: max 1200x1200, high quality
sharp(buffer)
.resize(1200, 1200, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 85 })
.toBuffer(),
// Medium: 600x600
sharp(buffer)
.resize(600, 600, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 80 })
.toBuffer(),
// Thumb: 200x200
sharp(buffer)
.resize(200, 200, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 75 })
.toBuffer(),
]);
// Convert to webp, preserve original dimensions, high quality
const originalBuffer = await sharp(buffer)
.webp({ quality: 90 })
.toBuffer();
// Save all sizes
await Promise.all([
fs.writeFile(fullPath, fullBuffer),
fs.writeFile(mediumPath, mediumBuffer),
fs.writeFile(thumbPath, thumbBuffer),
]);
const totalBytes = fullBuffer.length + mediumBuffer.length + thumbBuffer.length;
await fs.writeFile(originalPath, originalBuffer);
return {
full: fullPath,
medium: mediumPath,
thumb: thumbPath,
totalBytes,
original: originalPath,
totalBytes: originalBuffer.length,
};
}
@@ -135,47 +138,107 @@ async function processAndSaveImage(
* Convert a file path to a public URL
*/
function pathToUrl(filePath: string): string {
  // Maps a stored file path to its public URL under IMAGES_PUBLIC_URL.
  //
  // Preferred route: express the file relative to IMAGES_BASE_PATH. path.relative()
  // resolves both sides first, so a relative base ('./storage/images') and a
  // normalized file path ('storage/images/...') still line up. Unlike a regex search
  // for '/products/' it cannot be fooled by a base directory or slug that happens to
  // be called "products" or "brands".
  const relativeToBase = path.relative(IMAGES_BASE_PATH, filePath);
  const outsideBase =
    relativeToBase === '' ||
    relativeToBase === '..' ||
    relativeToBase.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToBase);
  if (!outsideBase) {
    // URLs always use forward slashes, whatever the platform separator is
    return `${IMAGES_PUBLIC_URL}/${relativeToBase.split(path.sep).join('/')}`;
  }

  // Fallback for paths that do not live under the base: find /products/ or /brands/
  // in the path and extract from there.
  const urlStylePath = filePath.split(path.sep).join('/');
  const productsMatch = urlStylePath.match(/(\/products\/.*)/);
  const brandsMatch = urlStylePath.match(/(\/brands\/.*)/);
  if (productsMatch) {
    return `${IMAGES_PUBLIC_URL}${productsMatch[1]}`;
  }
  if (brandsMatch) {
    return `${IMAGES_PUBLIC_URL}${brandsMatch[1]}`;
  }

  // Last resort: try to replace base path (works if paths match exactly)
  const relativePath = filePath.replace(IMAGES_BASE_PATH, '');
  return `${IMAGES_PUBLIC_URL}${relativePath}`;
}
/**
 * Build the directory path for a product image.
 * Structure: <IMAGES_BASE_PATH>/products/<state>/<store>/<brand>/<product>
 *
 * Each context value is slugified; an empty value is replaced by 'unknown'
 * before slugifying.
 */
function buildProductImagePath(ctx: ProductImageContext): string {
  const segments = [ctx.stateCode, ctx.storeSlug, ctx.brandSlug, ctx.productId].map(
    (part) => slugify(part || 'unknown')
  );
  return path.join(IMAGES_BASE_PATH, 'products', ...segments);
}
/**
* Download and store a product image with proper hierarchy
*
* @param imageUrl - The third-party image URL to download
* @param dispensaryId - The dispensary ID (for directory organization)
* @param productId - The product ID or external ID (for filename)
* @param ctx - Product context (state, store, brand, product)
* @param options - Download options
* @returns Download result with local URLs
*/
export async function downloadProductImage(
imageUrl: string,
dispensaryId: number,
productId: string | number
ctx: ProductImageContext,
options: { skipIfExists?: boolean } = {}
): Promise<DownloadResult> {
const { skipIfExists = true } = options;
try {
if (!imageUrl) {
return { success: false, error: 'No image URL provided' };
}
const outputDir = buildProductImagePath(ctx);
const urlHash = hashUrl(imageUrl);
const baseFilename = `image-${urlHash}`;
// Check if image already exists
if (skipIfExists) {
const existingPath = path.join(outputDir, `${baseFilename}.webp`);
try {
await fs.access(existingPath);
// Image exists, return existing URL
const url = pathToUrl(existingPath);
return {
success: true,
skipped: true,
urls: {
original: url,
full: url,
medium: url,
thumb: url,
},
localPaths: {
original: existingPath,
full: existingPath,
medium: existingPath,
thumb: existingPath,
},
};
} catch {
// Image doesn't exist, continue to download
}
}
// Download the image
const buffer = await downloadImage(imageUrl);
// Organize by dispensary ID
const outputDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));
// Use product ID + URL hash for uniqueness
const urlHash = hashUrl(imageUrl);
const baseFilename = `${productId}-${urlHash}`;
// Process and save
// Process and save (original only)
const result = await processAndSaveImage(buffer, outputDir, baseFilename);
const url = pathToUrl(result.original);
return {
success: true,
urls: {
full: pathToUrl(result.full),
medium: pathToUrl(result.medium),
thumb: pathToUrl(result.thumb),
original: url,
full: url,
medium: url,
thumb: url,
},
localPaths: {
original: result.original,
full: result.original,
medium: result.original,
thumb: result.original,
},
bytesDownloaded: result.totalBytes,
};
@@ -188,33 +251,70 @@ export async function downloadProductImage(
}
/**
 * Legacy entry point - keeps the old (imageUrl, dispensaryId, productId) signature.
 *
 * Delegates to downloadProductImage() with a synthetic context: state and
 * brand are 'unknown' and the store slug is `store-<dispensaryId>`.
 */
export async function downloadProductImageLegacy(
  imageUrl: string,
  dispensaryId: number,
  productId: string | number
): Promise<DownloadResult> {
  const legacyContext: ProductImageContext = {
    stateCode: 'unknown',
    storeSlug: `store-${dispensaryId}`,
    brandSlug: 'unknown',
    productId: String(productId),
    dispensaryId,
  };
  return downloadProductImage(imageUrl, legacyContext);
}
/**
* Download and store a brand logo
*
* @param logoUrl - The brand logo URL
* @param brandId - The brand ID or slug
* @param brandSlug - The brand slug/ID
* @returns Download result with local URL
*/
export async function downloadBrandLogo(
logoUrl: string,
brandId: string
brandSlug: string,
options: { skipIfExists?: boolean } = {}
): Promise<DownloadResult> {
const { skipIfExists = true } = options;
try {
if (!logoUrl) {
return { success: false, error: 'No logo URL provided' };
}
const safeBrandSlug = slugify(brandSlug);
const outputDir = path.join(IMAGES_BASE_PATH, 'brands', safeBrandSlug);
const urlHash = hashUrl(logoUrl);
const baseFilename = `logo-${urlHash}`;
// Check if logo already exists
if (skipIfExists) {
const existingPath = path.join(outputDir, `${baseFilename}.webp`);
try {
await fs.access(existingPath);
return {
success: true,
skipped: true,
urls: {
full: pathToUrl(existingPath),
medium: pathToUrl(existingPath),
thumb: pathToUrl(existingPath),
},
};
} catch {
// Logo doesn't exist, continue
}
}
// Download the image
const buffer = await downloadImage(logoUrl);
// Brand logos go in /images/brands/
const outputDir = path.join(IMAGES_BASE_PATH, 'brands');
// Sanitize brand ID for filename
const safeBrandId = brandId.replace(/[^a-zA-Z0-9-_]/g, '_');
const urlHash = hashUrl(logoUrl);
const baseFilename = `${safeBrandId}-${urlHash}`;
// Process and save (single size for logos)
// Brand logos in their own directory
await ensureDir(outputDir);
const logoPath = path.join(outputDir, `${baseFilename}.webp`);
@@ -243,20 +343,16 @@ export async function downloadBrandLogo(
}
/**
* Check if a local image already exists
* Check if a product image already exists
*/
export async function imageExists(
dispensaryId: number,
productId: string | number,
export async function productImageExists(
ctx: ProductImageContext,
imageUrl: string
): Promise<boolean> {
const outputDir = buildProductImagePath(ctx);
const urlHash = hashUrl(imageUrl);
const imagePath = path.join(
IMAGES_BASE_PATH,
'products',
String(dispensaryId),
`${productId}-${urlHash}.webp`
);
const imagePath = path.join(outputDir, `image-${urlHash}.webp`);
try {
await fs.access(imagePath);
return true;
@@ -266,24 +362,27 @@ export async function imageExists(
}
/**
 * Get the local image URL for a product (if exists).
 *
 * The file looked up is `image-<hashUrl(imageUrl)>.webp` inside the directory
 * returned by `buildProductImagePath(ctx)`. Only a single file is stored per
 * image, so every size key in the result points at the same URL
 * (NOTE(review): presumably resized variants are produced on demand by the
 * image proxy — confirm against the route that serves them).
 *
 * @param ctx - Product context used to build the storage directory.
 * @param imageUrl - Source image URL; its hash is part of the filename.
 * @returns The URL set for the stored file, or `null` when the file cannot
 *          be accessed (or the lookup fails for any other reason).
 */
export async function getProductImageUrl(
  ctx: ProductImageContext,
  imageUrl: string
): Promise<LocalImageSizes | null> {
  const outputDir = buildProductImagePath(ctx);
  const urlHash = hashUrl(imageUrl);
  const imagePath = path.join(outputDir, `image-${urlHash}.webp`);

  try {
    // fs.access rejects when the file is missing or not accessible.
    await fs.access(imagePath);
    const url = pathToUrl(imagePath);
    return {
      original: url,
      full: url,
      medium: url,
      thumb: url,
    };
  } catch {
    // Best-effort lookup: any failure is reported as "no local image".
    return null;
  }
}
@@ -296,19 +395,17 @@ export function isImageStorageReady(): boolean {
/**
 * Initialize the image storage directories.
 *
 * Ensures the `products` and `brands` directories exist under
 * `IMAGES_BASE_PATH` and records the outcome in `imageStorageReady`.
 *
 * Does NOT throw on failure: a warning is logged, `imageStorageReady` is set
 * to `false`, and the caller continues so the server can still start.
 */
export async function initializeImageStorage(): Promise<void> {
  try {
    await ensureDir(path.join(IMAGES_BASE_PATH, 'products'));
    await ensureDir(path.join(IMAGES_BASE_PATH, 'brands'));
    console.log(`[ImageStorage] Initialized at ${IMAGES_BASE_PATH}`);
    imageStorageReady = true;
  } catch (error: unknown) {
    // A rejected promise is not guaranteed to carry an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    console.warn(`[ImageStorage] WARNING: Could not initialize at ${IMAGES_BASE_PATH}: ${reason}`);
    console.warn('   Image features disabled. Server will continue without image downloads.');
    imageStorageReady = false;
    // Do NOT throw - server should still start
  }
}
@@ -316,34 +413,43 @@ export async function initializeImageStorage(): Promise<void> {
* Get storage stats
*/
export async function getStorageStats(): Promise<{
  basePath: string;
  productCount: number;
  brandCount: number;
  totalSizeBytes: number;
}> {
  // Reports how many `.webp` files live under the `products` and `brands`
  // trees of IMAGES_BASE_PATH, and their combined size in bytes.
  //
  // Stored files are named `image-<hash>.webp` (products) and
  // `<brandId>-<hash>.webp` (brand logos), so every stored name contains a
  // dash. Files must therefore NOT be filtered on '-', otherwise every count
  // and size would come out as zero.

  // Recursively tallies the `.webp` files below `dirPath`.
  async function countDir(dirPath: string): Promise<{ count: number; size: number }> {
    let count = 0;
    let size = 0;

    try {
      const entries = await fs.readdir(dirPath, { withFileTypes: true });

      for (const entry of entries) {
        const fullPath = path.join(dirPath, entry.name);

        if (entry.isDirectory()) {
          const sub = await countDir(fullPath);
          count += sub.count;
          size += sub.size;
        } else if (entry.name.endsWith('.webp')) {
          try {
            const stat = await fs.stat(fullPath);
            count++;
            size += stat.size;
          } catch {
            // The file could not be stat'ed (e.g. removed mid-walk); skip it
            // without discarding the totals gathered so far.
          }
        }
      }
    } catch {
      // Directory is missing or unreadable: report whatever was counted.
    }

    return { count, size };
  }

  const products = await countDir(path.join(IMAGES_BASE_PATH, 'products'));
  const brands = await countDir(path.join(IMAGES_BASE_PATH, 'brands'));

  return {
    basePath: IMAGES_BASE_PATH,
    productCount: products.count,
    brandCount: brands.count,
    totalSizeBytes: products.size + brands.size,
  };
}