feat: AZ dispensary harmonization with Dutchie source of truth
Major changes:
- Add harmonize-az-dispensaries.ts script to sync dispensaries with Dutchie API
- Add migration 057 for crawl_enabled and dutchie_verified fields
- Remove legacy dutchie-az module (replaced by platforms/dutchie)
- Clean up deprecated crawlers, scrapers, and orchestrator code
- Update location-discovery to not fall back to slug when ID is missing
- Add crawl-rotator service for proxy rotation
- Add types/index.ts for shared type definitions
- Add woodpecker-agent k8s manifest

Harmonization script:
- Queries ConsumerDispensaries API for all 32 AZ cities
- Matches dispensaries by platform_dispensary_id (not slug)
- Updates existing records with full Dutchie data
- Creates new records for unmatched Dutchie dispensaries
- Disables dispensaries not found in Dutchie

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
413
backend/src/services/crawl-rotator.ts
Normal file
413
backend/src/services/crawl-rotator.ts
Normal file
@@ -0,0 +1,413 @@
|
||||
/**
 * Crawl Rotator - Proxy & User Agent Rotation for Crawlers
 *
 * Manages rotation of proxies and user agents to avoid blocks.
 * Used by platform-specific crawlers (Dutchie, Jane, etc.).
 *
 * Canonical location: src/services/crawl-rotator.ts
 */
|
||||
|
||||
import { Pool } from 'pg';
|
||||
|
||||
// ============================================================
|
||||
// USER AGENT CONFIGURATION
|
||||
// ============================================================
|
||||
|
||||
/**
 * Built-in pool of desktop browser user-agent strings (Chrome, Firefox,
 * Safari and Edge on Windows, macOS and Linux).
 *
 * Used as the default list for `UserAgentRotator`. The browser versions are
 * a 2024 snapshot and need periodic refreshing to stay plausible.
 */
export const USER_AGENTS: string[] = [
  // Chrome on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',

  // Chrome on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',

  // Firefox on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',

  // Firefox on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',

  // Safari on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',

  // Edge on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',

  // Chrome on Linux
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
];
|
||||
|
||||
// ============================================================
|
||||
// PROXY TYPES
|
||||
// ============================================================
|
||||
|
||||
/**
 * One proxy endpoint together with its health counters.
 *
 * Instances are populated from the `proxies` table, whose snake_case
 * columns are aliased to these camelCase names by the loading query.
 */
export interface Proxy {
  /** Identifier used to look the proxy up and to update its row. */
  id: number;
  /** Proxy host name or address. */
  host: string;
  /** Proxy port. */
  port: number;
  /** Optional credential; only used when `password` is also present. */
  username?: string;
  /** Optional credential; only used when `username` is also present. */
  password?: string;
  /** URL scheme used when building the proxy URL. */
  protocol: 'http' | 'https' | 'socks5';
  /** Whether the proxy is eligible for rotation. */
  isActive: boolean;
  /** Time of the last recorded success, or `null` if none is recorded. */
  lastUsedAt: Date | null;
  /** Number of recorded failures. */
  failureCount: number;
  /** Number of recorded successes. */
  successCount: number;
  /** Weighted running average of response times in ms, or `null` if unknown. */
  avgResponseTimeMs: number | null;
}
|
||||
|
||||
/**
 * Aggregate health figures for the proxies currently held in memory.
 */
export interface ProxyStats {
  /** Number of proxies currently in the rotation list. */
  totalProxies: number;
  /** Number of those proxies flagged as active. */
  activeProxies: number;
  /** Number of those proxies at or above the failure threshold. */
  blockedProxies: number;
  /**
   * Mean of the per-proxy success ratios (0..1), counting only proxies that
   * have at least one recorded attempt; 0 when there are none.
   */
  avgSuccessRate: number;
}
|
||||
|
||||
// ============================================================
|
||||
// PROXY ROTATOR CLASS
|
||||
// ============================================================
|
||||
|
||||
/**
 * Round-robin rotation over the active proxies stored in the `proxies`
 * table, with success/failure bookkeeping kept both in memory and in the
 * database.
 *
 * Without a database pool the rotator still works, but its proxy list stays
 * empty and every getter returns `null`.
 */
export class ProxyRotator {
  /** Number of recorded failures at which a proxy is taken out of rotation. */
  private static readonly MAX_FAILURES = 5;

  private pool: Pool | null = null;
  private proxies: Proxy[] = [];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  /**
   * @param pool Optional database pool; it can also be supplied later via
   *   `setPool`.
   */
  constructor(pool?: Pool) {
    this.pool = pool ?? null;
  }

  /**
   * Attach (or replace) the database pool used for loading and updating
   * proxies.
   */
  setPool(pool: Pool): void {
    this.pool = pool;
  }

  /**
   * Load all active proxies from the database, least-failed and
   * least-recently-used first.
   *
   * Never throws: if no pool is configured the call is a no-op, and if the
   * query fails (for example because the table does not exist) the list is
   * emptied and a warning is logged.
   */
  async loadProxies(): Promise<void> {
    if (!this.pool) {
      console.warn('[ProxyRotator] No database pool configured');
      return;
    }

    try {
      const result = await this.pool.query<Proxy>(`
        SELECT
          id,
          host,
          port,
          username,
          password,
          protocol,
          is_active as "isActive",
          last_used_at as "lastUsedAt",
          failure_count as "failureCount",
          success_count as "successCount",
          avg_response_time_ms as "avgResponseTimeMs"
        FROM proxies
        WHERE is_active = true
        ORDER BY failure_count ASC, last_used_at ASC NULLS FIRST
      `);

      this.proxies = result.rows;
      console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies`);
    } catch (error) {
      // Table might not exist - that's okay
      console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
      this.proxies = [];
    }

    // A reload can shrink the list; never leave the cursor past its end.
    if (this.currentIndex >= this.proxies.length) {
      this.currentIndex = 0;
    }
  }

  /**
   * Advance the rotation and return the proxy now selected.
   *
   * @returns The newly selected proxy, or `null` when the list is empty.
   */
  getNext(): Proxy | null {
    if (this.proxies.length === 0) return null;

    // Round-robin rotation
    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    this.lastRotation = new Date();

    return this.proxies[this.currentIndex] ?? null;
  }

  /**
   * Return the currently selected proxy without rotating.
   *
   * @returns The selected proxy, or `null` when the list is empty.
   */
  getCurrent(): Proxy | null {
    return this.proxies[this.currentIndex] ?? null;
  }

  /**
   * Look up a loaded proxy by its ID.
   *
   * @returns The proxy, or `null` if it is not in the rotation list.
   */
  getById(id: number): Proxy | null {
    return this.proxies.find((p) => p.id === id) ?? null;
  }

  /**
   * Select a specific proxy as the current one.
   *
   * @returns `true` if the proxy was found and selected, `false` otherwise.
   */
  setProxy(id: number): boolean {
    const index = this.proxies.findIndex((p) => p.id === id);
    if (index === -1) return false;

    this.currentIndex = index;
    this.lastRotation = new Date();
    return true;
  }

  /**
   * Record a failure for a proxy, in memory and in the database.
   *
   * Once the proxy reaches the failure threshold it is flagged inactive and
   * dropped from the in-memory rotation. Database errors are logged, not
   * thrown.
   *
   * @param proxyId ID of the proxy that failed.
   * @param error Optional error text stored as the row's `last_error`.
   */
  async markFailed(proxyId: number, error?: string): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find((p) => p.id === proxyId);
    if (proxy) {
      proxy.failureCount++;

      // Deactivate if too many failures
      if (proxy.failureCount >= ProxyRotator.MAX_FAILURES) {
        proxy.isActive = false;
        this.removeFromRotation(proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} failures`);
      }
    }

    // Update database
    if (this.pool) {
      try {
        // The CASE sees the pre-update failure_count, hence the "+ 1": the row
        // is deactivated exactly when the incremented count hits the threshold.
        await this.pool.query(`
          UPDATE proxies
          SET
            failure_count = failure_count + 1,
            last_failure_at = NOW(),
            last_error = $2,
            is_active = CASE WHEN failure_count + 1 >= $3 THEN false ELSE is_active END
          WHERE id = $1
        `, [proxyId, error ?? null, ProxyRotator.MAX_FAILURES]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Record a success for a proxy, in memory and in the database.
   *
   * When a response time is given it is folded into the running average
   * (80% old value, 20% new sample). When it is omitted the stored average
   * is left untouched. Database errors are logged, not thrown.
   *
   * @param proxyId ID of the proxy that succeeded.
   * @param responseTimeMs Optional response time of the request in ms.
   */
  async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find((p) => p.id === proxyId);
    if (proxy) {
      proxy.successCount++;
      proxy.lastUsedAt = new Date();
      if (responseTimeMs !== undefined) {
        // Compare against null rather than truthiness so that a legitimate
        // 0 ms average is not treated as "no data yet".
        proxy.avgResponseTimeMs = proxy.avgResponseTimeMs == null
          ? responseTimeMs
          : (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2);
      }
    }

    // Update database
    if (!this.pool) return;

    try {
      if (responseTimeMs === undefined) {
        // No timing sample: bump the counters only. Sending NULL through the
        // averaging expression would erase the stored average.
        await this.pool.query(`
          UPDATE proxies
          SET
            success_count = success_count + 1,
            last_used_at = NOW()
          WHERE id = $1
        `, [proxyId]);
      } else {
        await this.pool.query(`
          UPDATE proxies
          SET
            success_count = success_count + 1,
            last_used_at = NOW(),
            avg_response_time_ms = CASE
              WHEN avg_response_time_ms IS NULL THEN $2
              ELSE (avg_response_time_ms * 0.8) + ($2 * 0.2)
            END
          WHERE id = $1
        `, [proxyId, responseTimeMs]);
      }
    } catch (err) {
      console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
    }
  }

  /**
   * Build the proxy URL (`protocol://[user:pass@]host:port`) for an HTTP
   * client.
   *
   * Credentials are included only when both username and password are set,
   * and are percent-encoded so that characters such as `@`, `:` or `/`
   * cannot corrupt the URL structure.
   */
  getProxyUrl(proxy: Proxy): string {
    const auth = proxy.username && proxy.password
      ? `${encodeURIComponent(proxy.username)}:${encodeURIComponent(proxy.password)}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /**
   * Summarize the in-memory proxy list.
   *
   * @returns Counts of total, active and blocked proxies plus the mean
   *   success ratio across proxies that have at least one recorded attempt.
   */
  getStats(): ProxyStats {
    const totalProxies = this.proxies.length;
    const activeProxies = this.proxies.filter((p) => p.isActive).length;
    const blockedProxies = this.proxies
      .filter((p) => p.failureCount >= ProxyRotator.MAX_FAILURES)
      .length;

    const successRates = this.proxies
      .filter((p) => p.successCount + p.failureCount > 0)
      .map((p) => p.successCount / (p.successCount + p.failureCount));

    const avgSuccessRate = successRates.length > 0
      ? successRates.reduce((a, b) => a + b, 0) / successRates.length
      : 0;

    return {
      totalProxies,
      activeProxies,
      blockedProxies,
      avgSuccessRate,
    };
  }

  /**
   * Check whether at least one proxy is available for rotation.
   */
  hasAvailableProxies(): boolean {
    return this.proxies.length > 0;
  }

  /**
   * Drop a proxy from the in-memory list while keeping the cursor valid.
   *
   * If the removed entry sat at or before the cursor, the cursor steps back
   * one slot (wrapping around), so the next `getNext()` yields the entry
   * that followed the removed one and `getCurrent()` never reads past the
   * end of the list.
   */
  private removeFromRotation(proxyId: number): void {
    const removedIndex = this.proxies.findIndex((p) => p.id === proxyId);
    if (removedIndex === -1) return;

    this.proxies.splice(removedIndex, 1);

    const remaining = this.proxies.length;
    if (remaining === 0) {
      this.currentIndex = 0;
    } else if (removedIndex <= this.currentIndex) {
      this.currentIndex = (this.currentIndex - 1 + remaining) % remaining;
    }
  }
}
|
||||
|
||||
// ============================================================
|
||||
// USER AGENT ROTATOR CLASS
|
||||
// ============================================================
|
||||
|
||||
/**
 * Round-robin rotation over a list of user-agent strings.
 */
export class UserAgentRotator {
  private userAgents: string[];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  /**
   * @param userAgents User-agent strings to rotate through. Defaults to the
   *   built-in `USER_AGENTS` list; an empty array also falls back to it.
   */
  constructor(userAgents: string[] = USER_AGENTS) {
    // With an empty list the modulo in getNext() would produce NaN and every
    // getter would return undefined, so fall back to the built-in list.
    this.userAgents = userAgents.length > 0 ? userAgents : USER_AGENTS;
    // Start at random index to avoid patterns
    this.currentIndex = Math.floor(Math.random() * this.userAgents.length);
  }

  /**
   * Advance the rotation and return the user agent now selected.
   */
  getNext(): string {
    this.currentIndex = (this.currentIndex + 1) % this.userAgents.length;
    this.lastRotation = new Date();
    return this.userAgents[this.currentIndex];
  }

  /**
   * Return the currently selected user agent without rotating.
   */
  getCurrent(): string {
    return this.userAgents[this.currentIndex];
  }

  /**
   * Return a uniformly random user agent; the rotation position is not
   * affected.
   */
  getRandom(): string {
    const index = Math.floor(Math.random() * this.userAgents.length);
    return this.userAgents[index];
  }

  /**
   * Return the number of user agents available for rotation.
   */
  getCount(): number {
    return this.userAgents.length;
  }
}
|
||||
|
||||
// ============================================================
|
||||
// COMBINED ROTATOR (for convenience)
|
||||
// ============================================================
|
||||
|
||||
/**
 * Convenience wrapper that pairs a `ProxyRotator` with a `UserAgentRotator`
 * so crawlers can rotate and report on both through a single object.
 */
export class CrawlRotator {
  /** Proxy rotation; owned by this instance. */
  public proxy: ProxyRotator;
  /** User-agent rotation over the default user-agent list. */
  public userAgent: UserAgentRotator;

  /**
   * @param pool Optional database pool handed to the proxy rotator.
   */
  constructor(pool?: Pool) {
    this.proxy = new ProxyRotator(pool);
    this.userAgent = new UserAgentRotator();
  }

  /**
   * Initialize the rotator by loading proxies from the database.
   */
  async initialize(): Promise<void> {
    await this.proxy.loadProxies();
  }

  /**
   * Rotate the proxy only.
   *
   * @returns The newly selected proxy, or `null` if none is available.
   */
  rotateProxy(): Proxy | null {
    return this.proxy.getNext();
  }

  /**
   * Rotate the user agent only.
   *
   * @returns The newly selected user agent.
   */
  rotateUserAgent(): string {
    return this.userAgent.getNext();
  }

  /**
   * Rotate both the proxy and the user agent.
   *
   * @returns The newly selected pair; `proxy` is `null` if none is available.
   */
  rotateBoth(): { proxy: Proxy | null; userAgent: string } {
    const proxy = this.proxy.getNext();
    const userAgent = this.userAgent.getNext();
    return { proxy, userAgent };
  }

  /**
   * Get the current proxy and user agent without rotating either.
   */
  getCurrent(): { proxy: Proxy | null; userAgent: string } {
    const proxy = this.proxy.getCurrent();
    const userAgent = this.userAgent.getCurrent();
    return { proxy, userAgent };
  }

  /**
   * Record a success against the currently selected proxy; a no-op when no
   * proxy is selected.
   *
   * @param responseTimeMs Optional response time of the request in ms.
   */
  async recordSuccess(responseTimeMs?: number): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markSuccess(current.id, responseTimeMs);
    }
  }

  /**
   * Record a failure against the currently selected proxy; a no-op when no
   * proxy is selected.
   *
   * @param error Optional error text to store with the failure.
   */
  async recordFailure(error?: string): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markFailed(current.id, error);
    }
  }
}
|
||||
|
||||
// ============================================================
|
||||
// SINGLETON INSTANCES
|
||||
// ============================================================
|
||||
|
||||
/**
 * Shared module-level instances.
 *
 * All three are created without a database pool. Note that `crawlRotator`
 * constructs its own internal `ProxyRotator`, so calling
 * `proxyRotator.setPool(...)` does not configure `crawlRotator.proxy`;
 * each needs its pool set separately.
 */
export const proxyRotator = new ProxyRotator();
export const userAgentRotator = new UserAgentRotator();
export const crawlRotator = new CrawlRotator();
|
||||
Reference in New Issue
Block a user