feat: AZ dispensary harmonization with Dutchie source of truth

Major changes:
- Add harmonize-az-dispensaries.ts script to sync dispensaries with Dutchie API
- Add migration 057 for crawl_enabled and dutchie_verified fields
- Remove legacy dutchie-az module (replaced by platforms/dutchie)
- Clean up deprecated crawlers, scrapers, and orchestrator code
- Update location-discovery to not fall back to slug when ID is missing
- Add crawl-rotator service for proxy rotation
- Add types/index.ts for shared type definitions
- Add woodpecker-agent k8s manifest

Harmonization script:
- Queries ConsumerDispensaries API for all 32 AZ cities
- Matches dispensaries by platform_dispensary_id (not slug)
- Updates existing records with full Dutchie data
- Creates new records for unmatched Dutchie dispensaries
- Disables dispensaries not found in Dutchie

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-08 10:19:49 -07:00
parent 948a732dd5
commit b7cfec0770
112 changed files with 3163 additions and 34694 deletions

View File

@@ -0,0 +1,413 @@
/**
* Crawl Rotator - Proxy & User Agent Rotation for Crawlers
*
* Manages rotation of proxies and user agents to avoid blocks.
* Used by platform-specific crawlers (Dutchie, Jane, etc.)
*
* Canonical location: src/services/crawl-rotator.ts
*/
import { Pool } from 'pg';
// ============================================================
// USER AGENT CONFIGURATION
// ============================================================
/**
* Pool of desktop browser User-Agent header strings used for rotation.
*
* Covers Chrome (Windows, macOS, Linux), Firefox (Windows, macOS),
* Safari (macOS) and Edge (Windows). This list is the default argument of
* the UserAgentRotator constructor.
*
* Updated: 2024 - the browser versions below (Chrome 118-120, Firefox 120-121,
* Safari 17.1-17.2, Edge 120) are hard-coded and need periodic refreshing.
*/
export const USER_AGENTS = [
// Chrome on Windows
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
// Chrome on macOS
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
// Firefox on Windows
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
// Firefox on macOS
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
// Safari on macOS
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
// Edge on Windows
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
// Chrome on Linux
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
];
// ============================================================
// PROXY TYPES
// ============================================================
/**
* One proxy endpoint as held in memory by ProxyRotator.
*
* ProxyRotator.loadProxies fills these fields from the `proxies` table,
* aliasing the snake_case columns (is_active, last_used_at, failure_count,
* success_count, avg_response_time_ms) to the camelCase names below.
*/
export interface Proxy {
/** Primary key of the row in `proxies`; used by the getById/setProxy/mark* methods. */
id: number;
/** Host part of the proxy URL. */
host: string;
/** Port part of the proxy URL. */
port: number;
/** Optional credentials; getProxyUrl only emits them when BOTH username and password are set. */
username?: string;
password?: string;
/** URL scheme placed in front of `://` by getProxyUrl. */
protocol: 'http' | 'https' | 'socks5';
/** Set to false by markFailed once failureCount reaches 5. */
isActive: boolean;
/** Set to the current time by markSuccess; null when the column is NULL. */
lastUsedAt: Date | null;
/** Incremented by markFailed. */
failureCount: number;
/** Incremented by markSuccess. */
successCount: number;
/** Rolling average (0.8 * previous + 0.2 * new sample) maintained by markSuccess; null until a sample exists. */
avgResponseTimeMs: number | null;
}
/**
* Summary returned by ProxyRotator.getStats, computed from the proxies
* currently held in memory (not from a fresh database query).
*/
export interface ProxyStats {
/** Number of proxies currently in the in-memory list. */
totalProxies: number;
/** Number of in-memory proxies whose isActive flag is true. */
activeProxies: number;
/** Number of in-memory proxies with failureCount >= 5. */
blockedProxies: number;
/**
* Mean of successCount / (successCount + failureCount) over proxies that
* have at least one recorded result; 0 when no proxy has any.
*/
avgSuccessRate: number;
}
// ============================================================
// PROXY ROTATOR CLASS
// ============================================================
/**
 * Round-robin rotation over the active rows of the `proxies` table.
 *
 * The list is loaded once by {@link loadProxies} and then kept in memory.
 * {@link markFailed} and {@link markSuccess} update both the in-memory entry
 * and, when a pool is configured, the database row.
 *
 * Without a pool the rotator stays empty and every getter returns `null`.
 */
export class ProxyRotator {
  /** Failure count at which a proxy is taken out of rotation. */
  private static readonly MAX_FAILURES = 5;

  private pool: Pool | null = null;
  private proxies: Proxy[] = [];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  /**
   * @param pool Optional database pool; can also be supplied later via {@link setPool}.
   */
  constructor(pool?: Pool) {
    this.pool = pool ?? null;
  }

  /**
   * Set (or replace) the database pool. Does not reload the proxy list.
   */
  setPool(pool: Pool): void {
    this.pool = pool;
  }

  /**
   * Replace the in-memory list with the active proxies from the database,
   * ordered by fewest failures, then least recently used.
   *
   * Never throws: with no pool it logs a warning and keeps the current list;
   * when the query fails (e.g. the table does not exist) it logs a warning
   * and empties the list.
   */
  async loadProxies(): Promise<void> {
    if (!this.pool) {
      console.warn('[ProxyRotator] No database pool configured');
      return;
    }
    try {
      const result = await this.pool.query<Proxy>(`
        SELECT
          id,
          host,
          port,
          username,
          password,
          protocol,
          is_active as "isActive",
          last_used_at as "lastUsedAt",
          failure_count as "failureCount",
          success_count as "successCount",
          avg_response_time_ms as "avgResponseTimeMs"
        FROM proxies
        WHERE is_active = true
        ORDER BY failure_count ASC, last_used_at ASC NULLS FIRST
      `);
      this.proxies = result.rows;
      console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies`);
    } catch (error) {
      // Table might not exist - that's okay
      console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
      this.proxies = [];
    }
    // A reload may return fewer rows than before; keep the cursor in range.
    this.clampCurrentIndex();
  }

  /**
   * Advance the cursor by one (wrapping around) and return that proxy.
   *
   * The cursor moves BEFORE the proxy is read, so on a freshly loaded list
   * with more than one entry the first call returns the second entry.
   *
   * @returns The new current proxy, or `null` when the list is empty.
   */
  getNext(): Proxy | null {
    if (this.proxies.length === 0) return null;
    // Round-robin rotation
    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    this.lastRotation = new Date();
    return this.proxies[this.currentIndex] ?? null;
  }

  /**
   * Return the proxy under the cursor without rotating.
   *
   * @returns The current proxy, or `null` when the list is empty.
   */
  getCurrent(): Proxy | null {
    if (this.proxies.length === 0) return null;
    return this.proxies[this.currentIndex] ?? null;
  }

  /**
   * Look up a loaded proxy by its database id.
   *
   * @returns The proxy, or `null` when it is not in the in-memory list.
   */
  getById(id: number): Proxy | null {
    return this.proxies.find(p => p.id === id) || null;
  }

  /**
   * Move the cursor to the proxy with the given id.
   *
   * @returns `true` when the proxy was found and selected, `false` otherwise.
   */
  setProxy(id: number): boolean {
    const index = this.proxies.findIndex(p => p.id === id);
    if (index === -1) return false;
    this.currentIndex = index;
    this.lastRotation = new Date();
    return true;
  }

  /**
   * Record a failure for a proxy. At 5 failures the proxy is deactivated
   * and removed from the in-memory rotation.
   *
   * The database row is updated even when the proxy is not in memory.
   * Database errors are logged, not thrown.
   *
   * @param proxyId Database id of the proxy.
   * @param error Optional message stored in `last_error`.
   */
  async markFailed(proxyId: number, error?: string): Promise<void> {
    // Update in-memory
    const index = this.proxies.findIndex(p => p.id === proxyId);
    const proxy = index === -1 ? undefined : this.proxies[index];
    if (proxy) {
      proxy.failureCount++;
      // Deactivate if too many failures
      if (proxy.failureCount >= ProxyRotator.MAX_FAILURES) {
        proxy.isActive = false;
        this.proxies = this.proxies.filter(p => p.id !== proxyId);
        // Removing an entry in front of the cursor shifts everything after it
        // down by one; follow the shift so the same proxy stays current.
        if (index < this.currentIndex) {
          this.currentIndex--;
        }
        // Removing the last entry while it was current would otherwise leave
        // the cursor past the end and make getCurrent() return undefined.
        this.clampCurrentIndex();
        console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} failures`);
      }
    }
    // Update database
    if (this.pool) {
      try {
        // The CASE sees the row's OLD failure_count, so ">= 4" here means the
        // new count is >= 5, matching the in-memory threshold above.
        await this.pool.query(`
          UPDATE proxies
          SET
            failure_count = failure_count + 1,
            last_failure_at = NOW(),
            last_error = $2,
            is_active = CASE WHEN failure_count >= 4 THEN false ELSE is_active END
          WHERE id = $1
        `, [proxyId, error || null]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Record a success for a proxy and optionally fold a response time into
   * its rolling average (0.8 * previous + 0.2 * new sample).
   *
   * When `responseTimeMs` is omitted the stored average is left untouched.
   * Database errors are logged, not thrown.
   *
   * @param proxyId Database id of the proxy.
   * @param responseTimeMs Optional response time of the successful request.
   */
  async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.successCount++;
      proxy.lastUsedAt = new Date();
      if (responseTimeMs !== undefined) {
        // Rolling average. Compare against null explicitly (like the SQL
        // below) so an existing average of 0 is not mistaken for "no average".
        proxy.avgResponseTimeMs = proxy.avgResponseTimeMs == null
          ? responseTimeMs
          : (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2);
      }
    }
    // Update database
    if (this.pool) {
      try {
        if (responseTimeMs === undefined) {
          // No sample: do not touch avg_response_time_ms. Passing NULL into
          // the rolling-average expression would evaluate to NULL and erase
          // the stored average.
          await this.pool.query(`
            UPDATE proxies
            SET
              success_count = success_count + 1,
              last_used_at = NOW()
            WHERE id = $1
          `, [proxyId]);
        } else {
          await this.pool.query(`
            UPDATE proxies
            SET
              success_count = success_count + 1,
              last_used_at = NOW(),
              avg_response_time_ms = CASE
                WHEN avg_response_time_ms IS NULL THEN $2
                ELSE (avg_response_time_ms * 0.8) + ($2 * 0.2)
              END
            WHERE id = $1
          `, [proxyId, responseTimeMs]);
        }
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Build `protocol://[user:pass@]host:port` for an HTTP client.
   *
   * Credentials are included only when both username and password are set,
   * and are percent-encoded so reserved characters such as `@`, `:` or `/`
   * cannot corrupt the URL.
   */
  getProxyUrl(proxy: Proxy): string {
    const auth = proxy.username && proxy.password
      ? `${encodeURIComponent(proxy.username)}:${encodeURIComponent(proxy.password)}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /**
   * Summarise the in-memory proxy list.
   *
   * Only proxies still in the list are counted; entries removed by
   * {@link markFailed} no longer contribute to any figure.
   */
  getStats(): ProxyStats {
    const totalProxies = this.proxies.length;
    const activeProxies = this.proxies.filter(p => p.isActive).length;
    const blockedProxies = this.proxies
      .filter(p => p.failureCount >= ProxyRotator.MAX_FAILURES).length;
    // Proxies with no recorded result are excluded to avoid dividing by zero.
    const successRates = this.proxies
      .filter(p => p.successCount + p.failureCount > 0)
      .map(p => p.successCount / (p.successCount + p.failureCount));
    const avgSuccessRate = successRates.length > 0
      ? successRates.reduce((a, b) => a + b, 0) / successRates.length
      : 0;
    return {
      totalProxies,
      activeProxies,
      blockedProxies,
      avgSuccessRate,
    };
  }

  /**
   * Whether at least one proxy is currently in the in-memory list.
   */
  hasAvailableProxies(): boolean {
    return this.proxies.length > 0;
  }

  /** Reset the cursor to the first entry when it points past the end of the list. */
  private clampCurrentIndex(): void {
    if (this.currentIndex >= this.proxies.length) {
      this.currentIndex = 0;
    }
  }
}
// ============================================================
// USER AGENT ROTATOR CLASS
// ============================================================
/**
 * Cycles through a list of User-Agent strings.
 *
 * The cursor starts at a random position so that separate instances do not
 * all begin with the same entry, then advances sequentially with wrap-around.
 */
export class UserAgentRotator {
  private userAgents: string[];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  /**
   * @param userAgents Strings to rotate through; defaults to USER_AGENTS.
   *   The array is stored by reference, not copied.
   */
  constructor(userAgents: string[] = USER_AGENTS) {
    this.userAgents = userAgents;
    // Random starting offset to avoid a predictable first user agent
    this.currentIndex = this.randomIndex();
  }

  /**
   * Step the cursor forward by one (wrapping at the end) and return that entry.
   */
  getNext(): string {
    const advanced = (this.currentIndex + 1) % this.userAgents.length;
    this.currentIndex = advanced;
    this.lastRotation = new Date();
    return this.userAgents[advanced];
  }

  /**
   * Return the entry under the cursor; the cursor is left where it is.
   */
  getCurrent(): string {
    const { userAgents, currentIndex } = this;
    return userAgents[currentIndex];
  }

  /**
   * Return a uniformly random entry; the cursor is not moved.
   */
  getRandom(): string {
    return this.userAgents[this.randomIndex()];
  }

  /**
   * Number of user agents in the rotation list.
   */
  getCount(): number {
    return this.userAgents.length;
  }

  /** Draw a random valid index into the user agent list. */
  private randomIndex(): number {
    return Math.floor(Math.random() * this.userAgents.length);
  }
}
// ============================================================
// COMBINED ROTATOR (for convenience)
// ============================================================
/**
 * Facade that owns one ProxyRotator and one UserAgentRotator, so a crawler
 * can rotate and report on both through a single object.
 */
export class CrawlRotator {
  public proxy: ProxyRotator;
  public userAgent: UserAgentRotator;

  /**
   * @param pool Optional database pool, forwarded to the internal ProxyRotator.
   */
  constructor(pool?: Pool) {
    this.proxy = new ProxyRotator(pool);
    this.userAgent = new UserAgentRotator();
  }

  /**
   * Prepare the rotator for use by loading the proxy list from the database.
   */
  async initialize(): Promise<void> {
    await this.proxy.loadProxies();
  }

  /**
   * Advance to the next proxy, leaving the user agent as it is.
   *
   * @returns The newly selected proxy, or `null` when none are loaded.
   */
  rotateProxy(): Proxy | null {
    const nextProxy = this.proxy.getNext();
    return nextProxy;
  }

  /**
   * Advance to the next user agent, leaving the proxy as it is.
   */
  rotateUserAgent(): string {
    const nextUserAgent = this.userAgent.getNext();
    return nextUserAgent;
  }

  /**
   * Advance the proxy first, then the user agent, and return both.
   */
  rotateBoth(): { proxy: Proxy | null; userAgent: string } {
    const proxy = this.proxy.getNext();
    const userAgent = this.userAgent.getNext();
    return { proxy, userAgent };
  }

  /**
   * Read the current proxy and user agent; nothing is rotated.
   */
  getCurrent(): { proxy: Proxy | null; userAgent: string } {
    const proxy = this.proxy.getCurrent();
    const userAgent = this.userAgent.getCurrent();
    return { proxy, userAgent };
  }

  /**
   * Report a successful request against whichever proxy is current.
   * Does nothing when there is no current proxy.
   */
  async recordSuccess(responseTimeMs?: number): Promise<void> {
    const active = this.proxy.getCurrent();
    if (!active) {
      return;
    }
    await this.proxy.markSuccess(active.id, responseTimeMs);
  }

  /**
   * Report a failed request against whichever proxy is current.
   * Does nothing when there is no current proxy.
   */
  async recordFailure(error?: string): Promise<void> {
    const active = this.proxy.getCurrent();
    if (!active) {
      return;
    }
    await this.proxy.markFailed(active.id, error);
  }
}
// ============================================================
// SINGLETON INSTANCES
// ============================================================
// Module-level shared instances. All three are constructed without a database
// pool, so a pool has to be attached (ProxyRotator.setPool) before
// loadProxies can load anything.
export const proxyRotator = new ProxyRotator();
export const userAgentRotator = new UserAgentRotator();
// NOTE: CrawlRotator's constructor creates its own ProxyRotator and
// UserAgentRotator, so crawlRotator does NOT share state with the two
// singletons above; its pool is set via crawlRotator.proxy.setPool(...).
export const crawlRotator = new CrawlRotator();