Files
cannaiq/backend/src/_deprecated/services/crawler-profiles.ts
Kelly a35976b9e9 chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser

- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md

- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 22:17:40 -07:00

364 lines
10 KiB
TypeScript

/**
* Crawler Profiles Service
*
* Manages per-store crawler configuration profiles.
* This service handles CRUD operations for dispensary_crawler_profiles
* and provides helper functions for loading active profiles.
*
* Phase 1: Basic profile loading for Dutchie production crawls only.
*/
import { pool } from '../db/pool';
import {
DispensaryCrawlerProfile,
DispensaryCrawlerProfileCreate,
DispensaryCrawlerProfileUpdate,
CrawlerProfileOptions,
} from '../types';
// ============================================================
// Database Row Mapping
// ============================================================
/**
 * Convert a raw `dispensary_crawler_profiles` row (snake_case columns) into
 * the camelCase `DispensaryCrawlerProfile` shape.
 *
 * A falsy `config` value is replaced with an empty object; every other column
 * is passed through unchanged.
 *
 * @param row - Row object as returned by the database driver
 * @returns The profile with camelCase property names
 */
function mapDbRowToProfile(row: any): DispensaryCrawlerProfile {
  const {
    id,
    dispensary_id: dispensaryId,
    profile_name: profileName,
    crawler_type: crawlerType,
    profile_key: profileKey,
    config: storedConfig,
    timeout_ms: timeoutMs,
    download_images: downloadImages,
    track_stock: trackStock,
    version,
    enabled,
    created_at: createdAt,
    updated_at: updatedAt,
  } = row;

  return {
    id,
    dispensaryId,
    profileName,
    crawlerType,
    profileKey,
    config: storedConfig || {},
    timeoutMs,
    downloadImages,
    trackStock,
    version,
    enabled,
    createdAt,
    updatedAt,
  };
}
// ============================================================
// Profile Retrieval
// ============================================================
/**
 * Resolve the crawler profile that should be used for a dispensary.
 *
 * Lookup order:
 * 1. The profile referenced by `dispensaries.active_crawler_profile_id`,
 *    provided that profile is enabled. This step does not filter on crawler type.
 * 2. Otherwise, the most recently created enabled profile owned by the
 *    dispensary whose `crawler_type` equals the type being matched.
 * 3. `null` when neither lookup returns a row.
 *
 * The type matched in step 2 is `crawlerType` when supplied, otherwise the
 * dispensary's `menu_type`, otherwise `'dutchie'` (the Phase 1 default).
 *
 * @param dispensaryId - ID of the dispensary to resolve a profile for
 * @param crawlerType - Optional crawler type used by the fallback lookup
 * @returns The mapped profile, or `null` if nothing matches
 */
export async function getActiveCrawlerProfileForDispensary(
  dispensaryId: number,
  crawlerType?: string
): Promise<DispensaryCrawlerProfile | null> {
  // Step 1: honour the profile explicitly linked from the dispensary row.
  const { rows: linkedRows } = await pool.query(
    `SELECT dcp.*
FROM dispensary_crawler_profiles dcp
INNER JOIN dispensaries d ON d.active_crawler_profile_id = dcp.id
WHERE d.id = $1 AND dcp.enabled = true`,
    [dispensaryId]
  );
  if (linkedRows.length > 0) {
    return mapDbRowToProfile(linkedRows[0]);
  }

  // Step 2: work out which crawler type the fallback lookup should match.
  let typeToMatch = crawlerType;
  if (!typeToMatch) {
    const { rows: dispensaryRows } = await pool.query(
      `SELECT menu_type FROM dispensaries WHERE id = $1`,
      [dispensaryId]
    );
    const menuType = dispensaryRows.length > 0 ? dispensaryRows[0].menu_type : undefined;
    // Missing dispensary or empty menu_type: use the Phase 1 default.
    typeToMatch = menuType || 'dutchie';
  }

  // Step 3: newest enabled profile of that type for this dispensary.
  const { rows: candidateRows } = await pool.query(
    `SELECT * FROM dispensary_crawler_profiles
WHERE dispensary_id = $1
AND crawler_type = $2
AND enabled = true
ORDER BY created_at DESC
LIMIT 1`,
    [dispensaryId, typeToMatch]
  );
  return candidateRows.length > 0 ? mapDbRowToProfile(candidateRows[0]) : null;
}
/**
 * List every crawler profile owned by a dispensary, newest first.
 *
 * The query has no `enabled` filter, so disabled profiles are included.
 *
 * @param dispensaryId - ID of the dispensary whose profiles are listed
 * @returns Mapped profiles ordered by `created_at` descending (possibly empty)
 */
export async function getProfilesForDispensary(
  dispensaryId: number
): Promise<DispensaryCrawlerProfile[]> {
  const { rows } = await pool.query(
    `SELECT * FROM dispensary_crawler_profiles
WHERE dispensary_id = $1
ORDER BY created_at DESC`,
    [dispensaryId]
  );

  const profiles: DispensaryCrawlerProfile[] = [];
  for (const row of rows) {
    profiles.push(mapDbRowToProfile(row));
  }
  return profiles;
}
/**
 * Fetch a single crawler profile by its primary key.
 *
 * @param profileId - ID of the profile row
 * @returns The mapped profile, or `null` when no row has that ID
 */
export async function getProfileById(
  profileId: number
): Promise<DispensaryCrawlerProfile | null> {
  const { rows } = await pool.query(
    `SELECT * FROM dispensary_crawler_profiles WHERE id = $1`,
    [profileId]
  );
  return rows.length === 0 ? null : mapDbRowToProfile(rows[0]);
}
// ============================================================
// Profile Creation & Update
// ============================================================
/**
 * Insert a new crawler profile row and return it in mapped form.
 *
 * Optional fields that are `null` or `undefined` receive these defaults:
 * `profileKey` -> `null`, `config` -> `{}`, `timeoutMs` -> `30000`,
 * `downloadImages` -> `true`, `trackStock` -> `true`, `version` -> `1`,
 * `enabled` -> `true`. `config` is serialized with `JSON.stringify` before
 * being bound.
 *
 * @param profile - Values for the new profile
 * @returns The inserted row, mapped to `DispensaryCrawlerProfile`
 */
export async function createCrawlerProfile(
  profile: DispensaryCrawlerProfileCreate
): Promise<DispensaryCrawlerProfile> {
  const insertSql = `INSERT INTO dispensary_crawler_profiles (
dispensary_id, profile_name, crawler_type, profile_key,
config, timeout_ms, download_images, track_stock, version, enabled
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
RETURNING *`;

  // Positional order must match the column list in the INSERT above.
  const insertParams = [
    profile.dispensaryId,
    profile.profileName,
    profile.crawlerType,
    profile.profileKey ?? null,
    JSON.stringify(profile.config ?? {}),
    profile.timeoutMs ?? 30000,
    profile.downloadImages ?? true,
    profile.trackStock ?? true,
    profile.version ?? 1,
    profile.enabled ?? true,
  ];

  const { rows } = await pool.query(insertSql, insertParams);
  return mapDbRowToProfile(rows[0]);
}
/**
 * Apply a partial update to an existing crawler profile.
 *
 * Only fields whose value is not `undefined` are written; `null` is written
 * as a value. `config` is serialized with `JSON.stringify` before being bound.
 * When no field is supplied, no UPDATE is issued and the current profile is
 * looked up and returned instead.
 *
 * @param profileId - ID of the profile to update
 * @param updates - Fields to change
 * @returns The updated (or, for an empty update, current) profile, or `null`
 *          when no row has that ID
 */
export async function updateCrawlerProfile(
  profileId: number,
  updates: DispensaryCrawlerProfileUpdate
): Promise<DispensaryCrawlerProfile | null> {
  // [column, supplied value, optional encoder], in the order SET clauses are emitted.
  const candidates: Array<[string, unknown, ((value: unknown) => unknown)?]> = [
    ['profile_name', updates.profileName],
    ['crawler_type', updates.crawlerType],
    ['profile_key', updates.profileKey],
    ['config', updates.config, (value) => JSON.stringify(value)],
    ['timeout_ms', updates.timeoutMs],
    ['download_images', updates.downloadImages],
    ['track_stock', updates.trackStock],
    ['version', updates.version],
    ['enabled', updates.enabled],
  ];

  const assignments: string[] = [];
  const boundValues: unknown[] = [];
  for (const [column, supplied, encode] of candidates) {
    if (supplied === undefined) {
      continue;
    }
    // Push first so the array length is this value's 1-based placeholder number.
    boundValues.push(encode ? encode(supplied) : supplied);
    assignments.push(`${column} = $${boundValues.length}`);
  }

  if (assignments.length === 0) {
    // Nothing to change: hand back the profile as it currently is.
    return getProfileById(profileId);
  }

  // The profile ID takes the final placeholder, used by the WHERE clause.
  boundValues.push(profileId);
  const { rows } = await pool.query(
    `UPDATE dispensary_crawler_profiles
SET ${assignments.join(', ')}
WHERE id = $${boundValues.length}
RETURNING *`,
    boundValues
  );
  return rows.length === 0 ? null : mapDbRowToProfile(rows[0]);
}
/**
 * Permanently remove a crawler profile.
 *
 * Any dispensary whose `active_crawler_profile_id` points at the profile is
 * detached first, then the profile row is deleted. The two statements are
 * issued as separate queries; this function does not wrap them in a
 * transaction. For a soft delete, call `updateCrawlerProfile` with
 * `enabled: false` instead.
 *
 * @param profileId - ID of the profile to delete
 * @returns `true` when the DELETE reported at least one affected row
 */
export async function deleteCrawlerProfile(profileId: number): Promise<boolean> {
  // Detach the profile from every dispensary that currently has it active.
  await pool.query(
    `UPDATE dispensaries SET active_crawler_profile_id = NULL
WHERE active_crawler_profile_id = $1`,
    [profileId]
  );

  const { rowCount } = await pool.query(
    `DELETE FROM dispensary_crawler_profiles WHERE id = $1`,
    [profileId]
  );
  // A missing rowCount counts as "nothing deleted".
  const deletedRows = rowCount ?? 0;
  return deletedRows > 0;
}
// ============================================================
// Active Profile Management
// ============================================================
/**
 * Mark a profile as the active crawler profile of a dispensary.
 *
 * The profile is loaded first and must exist, belong to the given dispensary
 * and be enabled; only then is `dispensaries.active_crawler_profile_id`
 * updated.
 *
 * @param dispensaryId - ID of the dispensary to update
 * @param profileId - ID of the profile to activate
 * @throws Error when the profile is missing, is owned by another dispensary,
 *         or is disabled
 */
export async function setActiveCrawlerProfile(
  dispensaryId: number,
  profileId: number
): Promise<void> {
  const candidate = await getProfileById(profileId);

  // Checks run in this order; the first failing one decides the message.
  let rejection: string | undefined;
  if (!candidate) {
    rejection = `Profile ${profileId} not found`;
  } else if (candidate.dispensaryId !== dispensaryId) {
    rejection = `Profile ${profileId} does not belong to dispensary ${dispensaryId}`;
  } else if (!candidate.enabled) {
    rejection = `Profile ${profileId} is not enabled`;
  }
  if (rejection !== undefined) {
    throw new Error(rejection);
  }

  await pool.query(
    `UPDATE dispensaries SET active_crawler_profile_id = $1 WHERE id = $2`,
    [profileId, dispensaryId]
  );
}
/**
 * Remove the active-profile pointer from a dispensary.
 *
 * Only `dispensaries.active_crawler_profile_id` is reset to NULL; no profile
 * row is modified or deleted.
 *
 * @param dispensaryId - ID of the dispensary to reset
 */
export async function clearActiveCrawlerProfile(dispensaryId: number): Promise<void> {
  const resetSql = `UPDATE dispensaries SET active_crawler_profile_id = NULL WHERE id = $1`;
  const resetParams = [dispensaryId];
  await pool.query(resetSql, resetParams);
}
// ============================================================
// Helper Functions
// ============================================================
/**
 * Reduce a stored profile to the runtime options consumed by the crawler.
 *
 * `timeoutMs` falls back to `30000` when the profile has it as `null` or
 * `undefined`; `downloadImages`, `trackStock` and `config` are copied as-is.
 *
 * @param profile - Profile to derive options from
 * @returns Runtime crawler options
 */
export function profileToOptions(profile: DispensaryCrawlerProfile): CrawlerProfileOptions {
  const { timeoutMs, downloadImages, trackStock, config } = profile;
  return {
    timeoutMs: timeoutMs ?? 30000,
    downloadImages,
    trackStock,
    config,
  };
}
/**
 * Build the crawler options used when a dispensary has no profile configured.
 *
 * A new object (with a new empty `config`) is created on every call.
 *
 * @returns Options with a 30000 ms timeout, image download and stock tracking
 *          switched on, and an empty config
 */
export function getDefaultCrawlerOptions(): CrawlerProfileOptions {
  const fallbackOptions: CrawlerProfileOptions = {
    timeoutMs: 30000,
    downloadImages: true,
    trackStock: true,
    config: {},
  };
  return fallbackOptions;
}
/**
 * Report whether a dispensary owns at least one crawler profile.
 *
 * The query has no `enabled` filter, so disabled profiles count too.
 *
 * @param dispensaryId - ID of the dispensary to check
 * @returns The `has_profiles` value from the EXISTS query, or `false` when
 *          the result has no row or the value is null
 */
export async function dispensaryHasProfiles(dispensaryId: number): Promise<boolean> {
  const { rows } = await pool.query(
    `SELECT EXISTS(SELECT 1 FROM dispensary_crawler_profiles WHERE dispensary_id = $1) as has_profiles`,
    [dispensaryId]
  );
  const firstRow = rows[0];
  return firstRow?.has_profiles ?? false;
}
/**
 * Count enabled crawler profiles per crawler type.
 *
 * Each row's `count` is converted with a base-10 `parseInt`.
 *
 * @returns One entry per crawler type, in the order returned by the query
 *          (which sorts by count, largest first)
 */
export async function getProfileStats(): Promise<{ crawlerType: string; count: number }[]> {
  const { rows } = await pool.query(
    `SELECT crawler_type, COUNT(*) as count
FROM dispensary_crawler_profiles
WHERE enabled = true
GROUP BY crawler_type
ORDER BY count DESC`
  );

  const stats: { crawlerType: string; count: number }[] = [];
  for (const row of rows) {
    stats.push({
      crawlerType: row.crawler_type,
      count: Number.parseInt(row.count, 10),
    });
  }
  return stats;
}