fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-03 18:45:05 -07:00
parent 54f40d26bb
commit 66e07b2009
466 changed files with 84988 additions and 9226 deletions

View File

@@ -0,0 +1,380 @@
"use strict";
/**
* Directory-Based Store Matcher
*
* Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists,
* then matches them to existing dispensaries by fuzzy name/city/address matching.
*
* This allows us to:
* 1. Find specific store URLs for directory-style websites
* 2. Match stores confidently by name+city
* 3. Mark non-Dutchie providers as not_crawlable until we build crawlers
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeSolDirectory = scrapeSolDirectory;
exports.scrapeCuraleafDirectory = scrapeCuraleafDirectory;
exports.matchDirectoryToDispensaries = matchDirectoryToDispensaries;
exports.previewDirectoryMatches = previewDirectoryMatches;
exports.applyHighConfidenceMatches = applyHighConfidenceMatches;
const connection_1 = require("../db/connection");
// ============================================================
// NORMALIZATION FUNCTIONS
// ============================================================
/**
 * Normalize a business name so two spellings of the same store compare equal.
 *
 * Steps, in order:
 *  1. Lowercase.
 *  2. Replace punctuation with spaces. This runs first so that tokens such as
 *     "Dispensary." or "medical-marijuana" are plain words for step 3.
 *  3. Drop generic industry words (dispensary, cannabis, marijuana, medical,
 *     recreational, shop, store, flower, wellness) wherever they follow other
 *     text. The word must be preceded by whitespace, so the first word of the
 *     name is never dropped and a name never normalizes to '' just because it
 *     consists of filler words.
 *  4. Collapse runs of whitespace and trim.
 *
 * @param {string|null|undefined} str - Raw name.
 * @returns {string} Normalized name, or '' for falsy input.
 */
function normalizeForComparison(str) {
    if (!str)
        return '';
    return str
        .toLowerCase()
        .replace(/[^\w\s]/g, ' ') // punctuation -> space
        .trim() // so leading punctuation cannot expose the first word to removal
        // The lookahead leaves the following whitespace unconsumed, so the next
        // filler word still has a leading space and is removed as well.
        .replace(/\s+(?:dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(?=\s|$)/g, ' ')
        .replace(/\s+/g, ' ') // collapse whitespace
        .trim();
}
/**
 * Canonical form of a city name for equality / substring checks.
 *
 * Lowercases the value, deletes every character that is neither a word
 * character nor whitespace, and trims the ends. Interior whitespace is left
 * exactly as it was.
 *
 * @param {string|null|undefined} city - Raw city name.
 * @returns {string} Normalized city, or '' for falsy input.
 */
function normalizeCity(city) {
    if (!city) {
        return '';
    }
    const lowered = city.toLowerCase();
    const withoutPunctuation = lowered.replace(/[^\w\s]/g, '');
    return withoutPunctuation.trim();
}
/**
 * Similarity ratio between two strings in the range 0..1.
 *
 * The ratio is (maxLen - editDistance) / maxLen, where editDistance comes from
 * levenshteinDistance and maxLen is the length of the longer argument.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 0 when either argument is falsy, 1 when both are strictly
 *   equal, otherwise the ratio described above.
 */
function stringSimilarity(a, b) {
    if (!a || !b) {
        return 0;
    }
    if (a === b) {
        return 1;
    }
    // On equal lengths `b` is treated as the longer one.
    const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
    const maxLen = longer.length;
    if (maxLen === 0) {
        return 1;
    }
    const edits = levenshteinDistance(longer, shorter);
    return (maxLen - edits) / maxLen;
}
/**
 * Edit distance (insertions, deletions, substitutions; each costs 1) between
 * two strings, computed with a full (b.length + 1) x (a.length + 1) table.
 *
 * @param {string} a - First string (table columns).
 * @param {string} b - Second string (table rows).
 * @returns {number} Minimum number of single-character edits turning one
 *   string into the other.
 */
function levenshteinDistance(a, b) {
    const rowCount = b.length;
    const colCount = a.length;
    // Row r starts with r: turning the first r chars of `b` into '' takes r deletions.
    const table = Array.from({ length: rowCount + 1 }, (_, r) => [r]);
    // Row 0: turning '' into the first c chars of `a` takes c insertions.
    for (let c = 0; c <= colCount; c += 1) {
        table[0][c] = c;
    }
    for (let r = 1; r <= rowCount; r += 1) {
        const rowChar = b.charAt(r - 1);
        for (let c = 1; c <= colCount; c += 1) {
            const diagonal = table[r - 1][c - 1];
            table[r][c] = rowChar === a.charAt(c - 1)
                ? diagonal
                : 1 + Math.min(diagonal, table[r][c - 1], table[r - 1][c]);
        }
    }
    return table[rowCount][colCount];
}
/**
 * Substring test performed on the normalized forms of both arguments.
 *
 * Both values go through normalizeForComparison before the check. A needle
 * that normalizes to '' is therefore contained in every haystack.
 *
 * @param {string} haystack - Text to search in.
 * @param {string} needle - Text to search for.
 * @returns {boolean} True when the normalized haystack includes the normalized needle.
 */
function containsNormalized(haystack, needle) {
    const normalizedHaystack = normalizeForComparison(haystack);
    const normalizedNeedle = normalizeForComparison(needle);
    return normalizedHaystack.includes(normalizedNeedle);
}
// ============================================================
// PROVIDER DIRECTORY SCRAPERS
// ============================================================
/**
 * Hardcoded Sol Flower locations, used whenever the live directory page cannot
 * be fetched or yields no parsable entries.
 *
 * A fresh array is built on every call so callers may mutate the result.
 *
 * NOTE(review): the '32nd & Shea' entry points at the 'deer-valley' URL slug.
 * The data is carried over unchanged - confirm the slug is intentional.
 *
 * @returns {Array<{name: string, city: string, state: string, address: string, storeUrl: string}>}
 */
function getSolFallbackStores() {
    return [
        { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
        { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
        { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
        { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
        { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
        { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
        { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
        { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
        { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
    ];
}
/**
 * Sol Flower (livewithsol.com) directory scraper.
 *
 * Fetches the locations page and extracts (link path, link text, first
 * following street-address-looking text) triples with a regular expression.
 * The city is inferred by searching the name and address for a fixed list of
 * Arizona city names; the first list entry that matches wins, and 'Unknown'
 * is used when none does.
 *
 * This function never rejects. A network failure, a non-2xx response, or a
 * page with no matches all result in the hardcoded fallback list.
 *
 * @returns {Promise<Array<{name: string, city: string, state: string, address: string, storeUrl: string}>>}
 */
async function scrapeSolDirectory() {
    console.log('[DirectoryMatcher] Scraping Sol Flower directory...');
    try {
        const response = await fetch('https://www.livewithsol.com/locations/', {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                Accept: 'text/html',
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const html = await response.text();
        // Format: <a href="/locations/slug/">NAME</a> with the address nearby.
        const locationRegex = /<a[^>]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi;
        // Built once per call instead of once per matched location.
        // None of these patterns is global, so .test() keeps no state between calls.
        const cityPatterns = [
            { pattern: /phoenix/i, city: 'Phoenix' },
            { pattern: /scottsdale/i, city: 'Scottsdale' },
            { pattern: /tempe/i, city: 'Tempe' },
            { pattern: /tucson/i, city: 'Tucson' },
            { pattern: /mesa/i, city: 'Mesa' },
            { pattern: /sun city/i, city: 'Sun City' },
            { pattern: /glendale/i, city: 'Glendale' },
        ];
        const stores = [];
        for (const [, path, name, address] of html.matchAll(locationRegex)) {
            const known = cityPatterns.find(({ pattern }) => pattern.test(name) || pattern.test(address));
            stores.push({
                name: name.trim(),
                city: known ? known.city : 'Unknown',
                state: 'AZ',
                address: address.trim(),
                storeUrl: `https://www.livewithsol.com${path}`,
            });
        }
        // If the regex found nothing (markup changed, empty page), fall back.
        if (stores.length === 0) {
            console.log('[DirectoryMatcher] Using hardcoded Sol locations');
            return getSolFallbackStores();
        }
        console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`);
        return stores;
    }
    catch (error) {
        // `error` may be any thrown value, including null/undefined.
        console.error('[DirectoryMatcher] Error scraping Sol directory:', error?.message ?? error);
        return getSolFallbackStores();
    }
}
/**
 * Curaleaf directory "scraper".
 *
 * No request is made: the site sits behind an age-gate that a plain fetch
 * cannot pass, so this returns a fixed list of Arizona locations. The original
 * author's note says a production version would use Playwright instead.
 *
 * @returns {Promise<Array<{name: string, city: string, state: string, address: string, storeUrl: string}>>}
 */
async function scrapeCuraleafDirectory() {
    console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...');
    // Every store URL shares this prefix; only the trailing slug differs.
    const urlPrefix = 'https://curaleaf.com/stores/curaleaf-az-';
    // [name, city, street address, url slug]
    const rows = [
        ['Curaleaf Phoenix Camelback', 'Phoenix', '4811 E Camelback Rd', 'phoenix-camelback'],
        ['Curaleaf Phoenix Midtown', 'Phoenix', '1928 E Highland Ave', 'phoenix-midtown'],
        ['Curaleaf Glendale East', 'Glendale', '5150 W Glendale Ave', 'glendale-east'],
        ['Curaleaf Glendale West', 'Glendale', '6501 W Glendale Ave', 'glendale-west'],
        ['Curaleaf Gilbert', 'Gilbert', '1736 E Williams Field Rd', 'gilbert'],
        ['Curaleaf Mesa', 'Mesa', '1540 S Power Rd', 'mesa'],
        ['Curaleaf Tempe', 'Tempe', '1815 E Broadway Rd', 'tempe'],
        ['Curaleaf Scottsdale', 'Scottsdale', '8904 E Indian Bend Rd', 'scottsdale'],
        ['Curaleaf Tucson Prince', 'Tucson', '3955 W Prince Rd', 'tucson-prince'],
        ['Curaleaf Tucson Midvale', 'Tucson', '2936 N Midvale Park Rd', 'tucson-midvale'],
        ['Curaleaf Sedona', 'Sedona', '525 AZ-179', 'sedona'],
        ['Curaleaf Youngtown', 'Youngtown', '11125 W Grand Ave', 'youngtown'],
    ];
    return rows.map(([name, city, address, slug]) => ({
        name,
        city,
        state: 'AZ',
        address,
        storeUrl: `${urlPrefix}${slug}`,
    }));
}
/**
 * Find the dispensary that best matches one directory store.
 *
 * Every candidate is scored out of 100:
 *  - name similarity (stringSimilarity on normalized names) x 50
 *  - city: 25 for an exact normalized match, 15 when one contains the other
 *  - address: 15 when the "street" parts (tokens 2-4 of the address, i.e.
 *    without the leading house number) are more than 70% similar
 *  - brand: 10 when the first word of the store name occurs in the
 *    dispensary name (plain substring test)
 *
 * The highest score wins; on ties the earliest candidate is kept. Confidence
 * is 'high' (>= 70), 'medium' (>= 50), 'low' (>= 30) or 'none'. The best
 * candidate is reported even when confidence is 'none', as long as it scored
 * above zero.
 *
 * Rows with a missing name or address are scored on the remaining criteria
 * rather than throwing, and an empty brand token earns no brand points.
 *
 * @param {{name: string, city: string, address?: string}} store - Directory entry.
 * @param {Array<{id: *, name: ?string, city: ?string, address: ?string}>} dispensaries - Candidates.
 * @returns {{directoryStore: object, dispensaryId: *, dispensaryName: ?string, confidence: string, matchReason: string}}
 */
function matchStoreToDispensary(store, dispensaries) {
    // "1540 S Power Rd" -> "s power rd": skip the house number, keep up to three tokens.
    const streetOf = (address) => address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
    // Everything derived from the store alone is computed once, outside the loop.
    const normalizedStoreName = normalizeForComparison(store.name);
    const normalizedStoreCity = normalizeCity(store.city);
    const storeStreet = store.address ? streetOf(store.address) : '';
    const brandName = (store.name || '').split(' ')[0].toLowerCase(); // e.g., "curaleaf", "sol"
    let bestMatch = null;
    let bestScore = 0;
    let matchReason = '';
    for (const disp of dispensaries) {
        const dispName = disp.name || ''; // name may be NULL in the database
        const normalizedDispName = normalizeForComparison(dispName);
        const normalizedDispCity = normalizeCity(disp.city || '');
        let score = 0;
        const reasons = [];
        // 1. Name similarity (max 50 points)
        const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName);
        score += nameSimilarity * 50;
        if (nameSimilarity > 0.8) {
            reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`);
        }
        // 2. City match (25 points for exact, 15 for partial)
        if (normalizedStoreCity && normalizedDispCity) {
            if (normalizedStoreCity === normalizedDispCity) {
                score += 25;
                reasons.push('city_exact');
            }
            else if (normalizedStoreCity.includes(normalizedDispCity) ||
                normalizedDispCity.includes(normalizedStoreCity)) {
                score += 15;
                reasons.push('city_partial');
            }
        }
        // 3. Address contains street name (15 points)
        if (storeStreet && disp.address) {
            const dispStreet = streetOf(disp.address);
            if (dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) {
                score += 15;
                reasons.push('address_match');
            }
        }
        // 4. Brand name in dispensary name (10 points).
        // The brandName guard matters: ''.includes('') is true for every string.
        if (brandName && dispName.toLowerCase().includes(brandName)) {
            score += 10;
            reasons.push('brand_match');
        }
        if (score > bestScore) {
            bestScore = score;
            bestMatch = disp;
            matchReason = reasons.join(', ');
        }
    }
    // Determine confidence level
    let confidence;
    if (bestScore >= 70) {
        confidence = 'high';
    }
    else if (bestScore >= 50) {
        confidence = 'medium';
    }
    else if (bestScore >= 30) {
        confidence = 'low';
    }
    else {
        confidence = 'none';
    }
    return {
        directoryStore: store,
        dispensaryId: bestMatch?.id || null,
        dispensaryName: bestMatch?.name || null,
        confidence,
        matchReason: matchReason || 'no_match',
    };
}
// ============================================================
// MAIN FUNCTIONS
// ============================================================
/**
 * Match one provider's directory listing against the dispensaries table.
 *
 * Flow: obtain the provider's store list ('curaleaf' or 'sol'), load every
 * dispensary row with state = 'AZ', score each directory store with
 * matchStoreToDispensary, and - unless dryRun is set - write each
 * high-confidence match back through applyDirectoryMatch, one at a time in
 * directory order.
 *
 * @param {string} provider - 'curaleaf' or 'sol'.
 * @param {boolean} [dryRun=true] - When true nothing is written to the database.
 * @returns {Promise<{provider: string, totalDirectoryStores: number, highConfidenceMatches: number, mediumConfidenceMatches: number, lowConfidenceMatches: number, unmatched: number, results: Array<object>}>}
 * @throws {Error} `Unknown provider: <provider>` for any other provider value.
 */
async function matchDirectoryToDispensaries(provider, dryRun = true) {
    console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`);
    // Get directory stores
    let directoryStores;
    switch (provider) {
        case 'curaleaf':
            directoryStores = await scrapeCuraleafDirectory();
            break;
        case 'sol':
            directoryStores = await scrapeSolDirectory();
            break;
        default:
            throw new Error(`Unknown provider: ${provider}`);
    }
    // Get all AZ dispensaries from database
    const runQuery = connection_1.query;
    const queryResult = await runQuery(`SELECT id, name, city, state, address, menu_type, menu_url, website
FROM dispensaries
WHERE state = 'AZ'`);
    const dispensaries = queryResult.rows;
    console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`);
    // Match each directory store
    const shouldApply = !dryRun;
    const results = [];
    for (const store of directoryStores) {
        const match = matchStoreToDispensary(store, dispensaries);
        results.push(match);
        // Writes are limited to high-confidence matches that resolved to a dispensary.
        const isConfident = match.confidence === 'high' && match.dispensaryId;
        if (shouldApply && isConfident) {
            await applyDirectoryMatch(match.dispensaryId, provider, store);
        }
    }
    // Count results per confidence level
    const countAt = (level) => results.reduce((total, entry) => total + (entry.confidence === level ? 1 : 0), 0);
    const report = {
        provider,
        totalDirectoryStores: directoryStores.length,
        highConfidenceMatches: countAt('high'),
        mediumConfidenceMatches: countAt('medium'),
        lowConfidenceMatches: countAt('low'),
        unmatched: countAt('none'),
        results,
    };
    console.log(`[DirectoryMatcher] ${provider} matching complete:`);
    console.log(` - High confidence: ${report.highConfidenceMatches}`);
    console.log(` - Medium confidence: ${report.mediumConfidenceMatches}`);
    console.log(` - Low confidence: ${report.lowConfidenceMatches}`);
    console.log(` - Unmatched: ${report.unmatched}`);
    return report;
}
/**
 * Persist a directory match on one dispensary row.
 *
 * The UPDATE sets menu_type to the provider and menu_url to the store URL,
 * clears platform_dispensary_id, and merges a detection record into
 * provider_detection_data. The record holds the provider, the
 * 'directory_match' method, a timestamp, the directory store's name, URL,
 * city and address, and a not_crawlable flag with its reason.
 *
 * @param {*} dispensaryId - Value bound to `WHERE id = $7`.
 * @param {string} provider - Provider key; also stored as menu_type.
 * @param {{storeUrl: string, name: string, city: string, address: string}} store - Matched directory entry.
 * @returns {Promise<void>}
 */
async function applyDirectoryMatch(dispensaryId, provider, store) {
    const { storeUrl, name, city, address } = store;
    console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${storeUrl}`);
    const notCrawlableReason = `${provider} proprietary menu - no crawler available`;
    // Positional parameters $1..$7, in this exact order.
    const params = [
        provider,
        storeUrl,
        name,
        city,
        address,
        notCrawlableReason,
        dispensaryId,
    ];
    const sql = `
UPDATE dispensaries SET
menu_type = $1,
menu_url = $2,
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $1::text,
'detection_method', 'directory_match'::text,
'detected_at', NOW(),
'directory_store_name', $3::text,
'directory_store_url', $2::text,
'directory_store_city', $4::text,
'directory_store_address', $5::text,
'not_crawlable', true,
'not_crawlable_reason', $6::text
),
updated_at = NOW()
WHERE id = $7
`;
    const runQuery = connection_1.query;
    await runQuery(sql, params);
}
/**
 * Dry-run entry point: produces the match report for a provider by calling
 * matchDirectoryToDispensaries with dryRun = true.
 *
 * @param {string} provider - Provider key forwarded unchanged.
 * @returns {Promise<object>} Whatever matchDirectoryToDispensaries resolves to.
 */
async function previewDirectoryMatches(provider) {
    const dryRun = true;
    return matchDirectoryToDispensaries(provider, dryRun);
}
/**
 * Write-mode entry point: runs matchDirectoryToDispensaries for a provider
 * with dryRun = false, so high-confidence matches are persisted.
 *
 * @param {string} provider - Provider key forwarded unchanged.
 * @returns {Promise<object>} Whatever matchDirectoryToDispensaries resolves to.
 */
async function applyHighConfidenceMatches(provider) {
    const dryRun = false;
    return matchDirectoryToDispensaries(provider, dryRun);
}