- Move deprecated directories to src/_deprecated/: - hydration/ (old pipeline approach) - scraper-v2/ (old Puppeteer scraper) - canonical-hydration/ (merged into tasks) - Unused services: availability, crawler-logger, geolocation, etc - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser - Archive outdated docs to docs/_archive/: - ANALYTICS_RUNBOOK.md - ANALYTICS_V2_EXAMPLES.md - BRAND_INTELLIGENCE_API.md - CRAWL_PIPELINE.md - TASK_WORKFLOW_2024-12-10.md - WORKER_TASK_ARCHITECTURE.md - ORGANIC_SCRAPING_GUIDE.md - Add docs/CODEBASE_MAP.md as single source of truth - Add warning files to deprecated/archived directories - Slim down CLAUDE.md to essential rules only 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
364 lines
10 KiB
TypeScript
364 lines
10 KiB
TypeScript
/**
 * Crawler Profiles Service
 *
 * Manages per-store crawler configuration profiles.
 * This service handles CRUD operations for dispensary_crawler_profiles
 * and provides helper functions for loading active profiles.
 *
 * Phase 1: Basic profile loading for Dutchie production crawls only.
 */
|
|
|
|
import { pool } from '../db/pool';
|
|
import {
|
|
DispensaryCrawlerProfile,
|
|
DispensaryCrawlerProfileCreate,
|
|
DispensaryCrawlerProfileUpdate,
|
|
CrawlerProfileOptions,
|
|
} from '../types';
|
|
|
|
// ============================================================
|
|
// Database Row Mapping
|
|
// ============================================================
|
|
|
|
/**
 * Translate a raw `dispensary_crawler_profiles` row (snake_case columns)
 * into the camelCase `DispensaryCrawlerProfile` shape.
 *
 * A falsy `config` column is replaced with an empty object; every other
 * column is copied through unchanged.
 *
 * @param row - Row object as returned by the database driver.
 * @returns The profile in application (camelCase) form.
 */
function mapDbRowToProfile(row: any): DispensaryCrawlerProfile {
  const {
    id,
    dispensary_id: dispensaryId,
    profile_name: profileName,
    crawler_type: crawlerType,
    profile_key: profileKey,
    config: storedConfig,
    timeout_ms: timeoutMs,
    download_images: downloadImages,
    track_stock: trackStock,
    version,
    enabled,
    created_at: createdAt,
    updated_at: updatedAt,
  } = row;

  // Fall back to an empty config when the column is null/undefined/falsy.
  const config = storedConfig || {};

  return {
    id,
    dispensaryId,
    profileName,
    crawlerType,
    profileKey,
    config,
    timeoutMs,
    downloadImages,
    trackStock,
    version,
    enabled,
    createdAt,
    updatedAt,
  };
}
|
|
|
|
// ============================================================
|
|
// Profile Retrieval
|
|
// ============================================================
|
|
|
|
/**
 * Resolve the crawler profile that should be used for a dispensary.
 *
 * Lookup proceeds in this order:
 *   1. The profile referenced by `dispensaries.active_crawler_profile_id`,
 *      provided that profile is enabled.
 *   2. Otherwise, the newest (by `created_at`) enabled profile of the
 *      dispensary whose `crawler_type` equals the effective crawler type.
 *      The effective type is `crawlerType` when given, else the dispensary's
 *      `menu_type`, else `'dutchie'` (Phase 1 default).
 *   3. `null` when neither lookup yields a row.
 *
 * @param dispensaryId - ID of the dispensary to resolve a profile for.
 * @param crawlerType - Optional crawler type used by the fallback lookup.
 * @returns The resolved profile, or `null` if none matches.
 */
export async function getActiveCrawlerProfileForDispensary(
  dispensaryId: number,
  crawlerType?: string
): Promise<DispensaryCrawlerProfile | null> {
  // Step 1: an explicitly pinned, enabled profile wins outright.
  const { rows: pinnedRows } = await pool.query(
    `SELECT dcp.*
     FROM dispensary_crawler_profiles dcp
     INNER JOIN dispensaries d ON d.active_crawler_profile_id = dcp.id
     WHERE d.id = $1 AND dcp.enabled = true`,
    [dispensaryId]
  );
  if (pinnedRows.length > 0) {
    return mapDbRowToProfile(pinnedRows[0]);
  }

  // Step 2: work out which crawler type the fallback lookup should target.
  let effectiveCrawlerType = crawlerType;
  if (!effectiveCrawlerType) {
    const { rows: dispensaryRows } = await pool.query(
      `SELECT menu_type FROM dispensaries WHERE id = $1`,
      [dispensaryId]
    );
    const menuType = dispensaryRows.length > 0 ? dispensaryRows[0].menu_type : undefined;
    if (menuType) {
      effectiveCrawlerType = menuType;
    }
  }

  // Phase 1: when nothing else tells us the type, assume Dutchie.
  effectiveCrawlerType = effectiveCrawlerType || 'dutchie';

  // Step 3: newest enabled profile of that type for this dispensary.
  const { rows: fallbackRows } = await pool.query(
    `SELECT * FROM dispensary_crawler_profiles
     WHERE dispensary_id = $1
       AND crawler_type = $2
       AND enabled = true
     ORDER BY created_at DESC
     LIMIT 1`,
    [dispensaryId, effectiveCrawlerType]
  );

  return fallbackRows.length > 0 ? mapDbRowToProfile(fallbackRows[0]) : null;
}
|
|
|
|
/**
 * List every crawler profile stored for a dispensary, newest first.
 *
 * Enabled and disabled profiles are both returned; ordering is by
 * `created_at` descending.
 *
 * @param dispensaryId - ID of the dispensary whose profiles are wanted.
 * @returns The mapped profiles (an empty array when there are none).
 */
export async function getProfilesForDispensary(
  dispensaryId: number
): Promise<DispensaryCrawlerProfile[]> {
  const { rows } = await pool.query(
    `SELECT * FROM dispensary_crawler_profiles
     WHERE dispensary_id = $1
     ORDER BY created_at DESC`,
    [dispensaryId]
  );

  const profiles: DispensaryCrawlerProfile[] = [];
  for (const row of rows) {
    profiles.push(mapDbRowToProfile(row));
  }
  return profiles;
}
|
|
|
|
/**
 * Fetch a single crawler profile by its primary key.
 *
 * @param profileId - ID of the profile row.
 * @returns The mapped profile, or `null` when no row has that ID.
 */
export async function getProfileById(
  profileId: number
): Promise<DispensaryCrawlerProfile | null> {
  const { rows } = await pool.query(
    `SELECT * FROM dispensary_crawler_profiles WHERE id = $1`,
    [profileId]
  );

  return rows.length === 0 ? null : mapDbRowToProfile(rows[0]);
}
|
|
|
|
// ============================================================
|
|
// Profile Creation & Update
|
|
// ============================================================
|
|
|
|
/**
 * Insert a new crawler profile and return the stored row.
 *
 * Optional fields that are null/undefined on `profile` are filled in here:
 * `profileKey` -> null, `config` -> `{}`, `timeoutMs` -> 30000,
 * `downloadImages` -> true, `trackStock` -> true, `version` -> 1,
 * `enabled` -> true. `config` is sent to the database as a JSON string.
 *
 * @param profile - Values for the new profile.
 * @returns The newly created profile as returned by `RETURNING *`.
 */
export async function createCrawlerProfile(
  profile: DispensaryCrawlerProfileCreate
): Promise<DispensaryCrawlerProfile> {
  // Order must match the column list in the INSERT statement below.
  const insertParams = [
    profile.dispensaryId,
    profile.profileName,
    profile.crawlerType,
    profile.profileKey ?? null,
    JSON.stringify(profile.config ?? {}),
    profile.timeoutMs ?? 30000,
    profile.downloadImages ?? true,
    profile.trackStock ?? true,
    profile.version ?? 1,
    profile.enabled ?? true,
  ];

  const { rows } = await pool.query(
    `INSERT INTO dispensary_crawler_profiles (
       dispensary_id, profile_name, crawler_type, profile_key,
       config, timeout_ms, download_images, track_stock, version, enabled
     ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
     RETURNING *`,
    insertParams
  );

  return mapDbRowToProfile(rows[0]);
}
|
|
|
|
/**
 * Apply a partial update to an existing crawler profile.
 *
 * Only fields of `updates` that are not `undefined` are written; `null`
 * values are written as-is. `config` is serialised with `JSON.stringify`
 * before being sent. When no field is supplied, no UPDATE is issued and the
 * current profile is returned via `getProfileById`.
 *
 * @param profileId - ID of the profile to update.
 * @param updates - Fields to change.
 * @returns The updated profile, or `null` when no row has that ID.
 */
export async function updateCrawlerProfile(
  profileId: number,
  updates: DispensaryCrawlerProfileUpdate
): Promise<DispensaryCrawlerProfile | null> {
  // Candidate column assignments, in the order they appear in the SET clause.
  const candidates: Array<{ column: string; value: unknown; asJson?: boolean }> = [
    { column: 'profile_name', value: updates.profileName },
    { column: 'crawler_type', value: updates.crawlerType },
    { column: 'profile_key', value: updates.profileKey },
    { column: 'config', value: updates.config, asJson: true },
    { column: 'timeout_ms', value: updates.timeoutMs },
    { column: 'download_images', value: updates.downloadImages },
    { column: 'track_stock', value: updates.trackStock },
    { column: 'version', value: updates.version },
    { column: 'enabled', value: updates.enabled },
  ];

  // Keep only the fields the caller actually supplied.
  const supplied = candidates.filter((candidate) => candidate.value !== undefined);

  if (supplied.length === 0) {
    // Nothing to update
    return getProfileById(profileId);
  }

  // Placeholders are numbered 1..n in SET order; the id takes slot n + 1.
  const setClauses = supplied.map(
    (candidate, position) => `${candidate.column} = $${position + 1}`
  );
  const values: unknown[] = supplied.map((candidate) =>
    candidate.asJson ? JSON.stringify(candidate.value) : candidate.value
  );
  values.push(profileId);
  const idPlaceholder = supplied.length + 1;

  const { rows } = await pool.query(
    `UPDATE dispensary_crawler_profiles
     SET ${setClauses.join(', ')}
     WHERE id = $${idPlaceholder}
     RETURNING *`,
    values
  );

  return rows.length === 0 ? null : mapDbRowToProfile(rows[0]);
}
|
|
|
|
/**
 * Permanently remove a crawler profile (hard delete).
 *
 * Any dispensary whose `active_crawler_profile_id` points at the profile is
 * reset to NULL first, then the profile row is deleted. For a soft delete,
 * call `updateCrawlerProfile` with `enabled = false` instead.
 *
 * NOTE(review): the two statements run as separate queries on the pool, not
 * inside an explicit transaction.
 *
 * @param profileId - ID of the profile to delete.
 * @returns `true` when a profile row was deleted, `false` otherwise.
 */
export async function deleteCrawlerProfile(profileId: number): Promise<boolean> {
  // Detach the profile from any dispensary that has it pinned as active.
  await pool.query(
    `UPDATE dispensaries SET active_crawler_profile_id = NULL
     WHERE active_crawler_profile_id = $1`,
    [profileId]
  );

  const { rowCount } = await pool.query(
    `DELETE FROM dispensary_crawler_profiles WHERE id = $1`,
    [profileId]
  );

  // A missing rowCount is treated as zero rows deleted.
  const deletedRows = rowCount ?? 0;
  return deletedRows > 0;
}
|
|
|
|
// ============================================================
|
|
// Active Profile Management
|
|
// ============================================================
|
|
|
|
/**
 * Pin a profile as the active crawler profile of a dispensary.
 *
 * The profile must exist, belong to `dispensaryId`, and be enabled; the
 * checks run in that order and the first failure is reported.
 *
 * @param dispensaryId - ID of the dispensary to update.
 * @param profileId - ID of the profile to make active.
 * @throws Error when the profile is missing, belongs to another dispensary,
 *   or is disabled.
 */
export async function setActiveCrawlerProfile(
  dispensaryId: number,
  profileId: number
): Promise<void> {
  const profile = await getProfileById(profileId);

  // Work out why (if at all) this profile cannot be activated.
  let rejection: string | null = null;
  if (!profile) {
    rejection = `Profile ${profileId} not found`;
  } else if (profile.dispensaryId !== dispensaryId) {
    rejection = `Profile ${profileId} does not belong to dispensary ${dispensaryId}`;
  } else if (!profile.enabled) {
    rejection = `Profile ${profileId} is not enabled`;
  }

  if (rejection !== null) {
    throw new Error(rejection);
  }

  await pool.query(
    `UPDATE dispensaries SET active_crawler_profile_id = $1 WHERE id = $2`,
    [profileId, dispensaryId]
  );
}
|
|
|
|
/**
 * Unpin the active crawler profile of a dispensary by setting
 * `dispensaries.active_crawler_profile_id` to NULL.
 *
 * @param dispensaryId - ID of the dispensary to update.
 */
export async function clearActiveCrawlerProfile(dispensaryId: number): Promise<void> {
  const params = [dispensaryId];
  await pool.query(
    `UPDATE dispensaries SET active_crawler_profile_id = NULL WHERE id = $1`,
    params
  );
}
|
|
|
|
// ============================================================
|
|
// Helper Functions
|
|
// ============================================================
|
|
|
|
/**
 * Derive the runtime crawler options from a stored profile.
 *
 * `timeoutMs` falls back to 30000 when the profile's value is null or
 * undefined; `downloadImages`, `trackStock` and `config` are passed through
 * unchanged.
 *
 * @param profile - The profile to convert.
 * @returns Options object for the crawler.
 */
export function profileToOptions(profile: DispensaryCrawlerProfile): CrawlerProfileOptions {
  const { timeoutMs, downloadImages, trackStock, config } = profile;

  // `??` (not a destructuring default) so that null also gets the fallback.
  const effectiveTimeoutMs = timeoutMs ?? 30000;

  return {
    timeoutMs: effectiveTimeoutMs,
    downloadImages,
    trackStock,
    config,
  };
}
|
|
|
|
/**
 * Build the crawler options used when a dispensary has no profile:
 * a 30000 ms timeout, image download and stock tracking enabled, and an
 * empty config.
 *
 * @returns A freshly created options object on every call.
 */
export function getDefaultCrawlerOptions(): CrawlerProfileOptions {
  const defaults: CrawlerProfileOptions = {
    timeoutMs: 30000,
    downloadImages: true,
    trackStock: true,
    config: {},
  };
  return defaults;
}
|
|
|
|
/**
 * Report whether at least one crawler profile (enabled or not) exists for
 * a dispensary.
 *
 * @param dispensaryId - ID of the dispensary to check.
 * @returns The `has_profiles` value from the EXISTS query, or `false` when
 *   the query yields no row or a null/undefined value.
 */
export async function dispensaryHasProfiles(dispensaryId: number): Promise<boolean> {
  const { rows } = await pool.query(
    `SELECT EXISTS(SELECT 1 FROM dispensary_crawler_profiles WHERE dispensary_id = $1) as has_profiles`,
    [dispensaryId]
  );

  const [existsRow] = rows;
  return existsRow?.has_profiles ?? false;
}
|
|
|
|
/**
 * Count enabled crawler profiles per crawler type, largest count first.
 *
 * Each row's `count` is parsed as a base-10 integer before being returned
 * (presumably because the driver delivers COUNT(*) as a string — confirm).
 *
 * @returns One `{ crawlerType, count }` entry per crawler type that has at
 *   least one enabled profile.
 */
export async function getProfileStats(): Promise<{ crawlerType: string; count: number }[]> {
  const { rows } = await pool.query(
    `SELECT crawler_type, COUNT(*) as count
     FROM dispensary_crawler_profiles
     WHERE enabled = true
     GROUP BY crawler_type
     ORDER BY count DESC`
  );

  const stats: { crawlerType: string; count: number }[] = [];
  for (const row of rows) {
    stats.push({
      crawlerType: row.crawler_type,
      count: parseInt(row.count, 10),
    });
  }
  return stats;
}
|