feat: AZ dispensary harmonization with Dutchie source of truth

Major changes:
- Add harmonize-az-dispensaries.ts script to sync dispensaries with Dutchie API
- Add migration 057 for crawl_enabled and dutchie_verified fields
- Remove legacy dutchie-az module (replaced by platforms/dutchie)
- Clean up deprecated crawlers, scrapers, and orchestrator code
- Update location-discovery to not fall back to slug when ID is missing
- Add crawl-rotator service for proxy rotation
- Add types/index.ts for shared type definitions
- Add woodpecker-agent k8s manifest

Harmonization script:
- Queries ConsumerDispensaries API for all 32 AZ cities
- Matches dispensaries by platform_dispensary_id (not slug)
- Updates existing records with full Dutchie data
- Creates new records for unmatched Dutchie dispensaries
- Disables dispensaries not found in Dutchie

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-08 10:19:49 -07:00
parent 948a732dd5
commit b7cfec0770
112 changed files with 3163 additions and 34694 deletions

View File

@@ -1,53 +1,39 @@
/**
* Stores API Routes
*
* NOTE: "Store" and "Dispensary" are synonyms in CannaiQ.
* - This file handles `/api/stores` endpoints
* - The DB table is `dispensaries` (NOT `stores`)
* - Use these terms interchangeably
* - `/api/stores` and `/api/dispensaries` both work
*/
import { Router } from 'express';
import { authMiddleware, requireRole } from '../auth/middleware';
import { pool } from '../db/pool';
import { scrapeStore, scrapeCategory, discoverCategories } from '../scraper-v2';
const router = Router();
router.use(authMiddleware);
// Get all stores
/**
 * GET / — list every row of `stores`, ordered by name.
 *
 * Each row carries `product_count` and `category_count`, computed as
 * distinct counts over LEFT JOINs to `products` and `categories`.
 * Responds with `{ stores: [...] }`, or HTTP 500 with
 * `{ error: 'Failed to fetch stores' }` if the query fails.
 */
router.get('/', async (req, res) => {
  try {
    // Assembled line by line; the leading and trailing '' keep the
    // surrounding newlines of the statement text.
    const listStoresSql = [
      '',
      'SELECT',
      's.*,',
      'COUNT(DISTINCT p.id) as product_count,',
      'COUNT(DISTINCT c.id) as category_count',
      'FROM stores s',
      'LEFT JOIN products p ON s.id = p.store_id',
      'LEFT JOIN categories c ON s.id = c.store_id',
      'GROUP BY s.id',
      'ORDER BY s.name',
      '',
    ].join('\n');

    const { rows } = await pool.query(listStoresSql);
    res.json({ stores: rows });
  } catch (err) {
    console.error('Error fetching stores:', err);
    res.status(500).json({ error: 'Failed to fetch stores' });
  }
});
// Freshness threshold in hours
const STALE_THRESHOLD_HOURS = 4;
function calculateFreshness(lastScrapedAt: Date | null): {
last_scraped_at: string | null;
function calculateFreshness(lastCrawlAt: Date | null): {
last_crawl_at: string | null;
is_stale: boolean;
freshness: string;
hours_since_scrape: number | null;
hours_since_crawl: number | null;
} {
if (!lastScrapedAt) {
if (!lastCrawlAt) {
return {
last_scraped_at: null,
last_crawl_at: null,
is_stale: true,
freshness: 'Never scraped',
hours_since_scrape: null
freshness: 'Never crawled',
hours_since_crawl: null
};
}
const now = new Date();
const diffMs = now.getTime() - lastScrapedAt.getTime();
const diffMs = now.getTime() - lastCrawlAt.getTime();
const diffHours = diffMs / (1000 * 60 * 60);
const isStale = diffHours > STALE_THRESHOLD_HOURS;
@@ -64,49 +50,123 @@ function calculateFreshness(lastScrapedAt: Date | null): {
}
return {
last_scraped_at: lastScrapedAt.toISOString(),
last_crawl_at: lastCrawlAt.toISOString(),
is_stale: isStale,
freshness: freshnessText,
hours_since_scrape: Math.round(diffHours * 10) / 10
hours_since_crawl: Math.round(diffHours * 10) / 10
};
}
function detectProvider(dutchieUrl: string | null): string {
if (!dutchieUrl) return 'unknown';
if (dutchieUrl.includes('dutchie.com')) return 'Dutchie';
if (dutchieUrl.includes('iheartjane.com') || dutchieUrl.includes('jane.co')) return 'Jane';
if (dutchieUrl.includes('treez.io')) return 'Treez';
if (dutchieUrl.includes('weedmaps.com')) return 'Weedmaps';
if (dutchieUrl.includes('leafly.com')) return 'Leafly';
/**
 * Identify the menu platform behind a store's menu URL.
 *
 * The URL is matched against known provider domains by substring. Matching
 * is case-insensitive because hostnames are case-insensitive, so
 * `https://DUTCHIE.COM/...` is recognised the same as `https://dutchie.com/...`.
 *
 * @param menuUrl - The store's menu URL, or `null` when none is recorded.
 * @returns `'unknown'` for a missing or empty URL, the provider name
 *   (`'Dutchie'`, `'Jane'`, `'Treez'`, `'Weedmaps'`, `'Leafly'`) when a known
 *   domain appears in the URL, otherwise `'Custom'`.
 */
function detectProvider(menuUrl: string | null): string {
  if (!menuUrl) return 'unknown';

  // Order matters: the first provider whose marker appears in the URL wins.
  const providerMarkers: ReadonlyArray<readonly [string, readonly string[]]> = [
    ['Dutchie', ['dutchie.com']],
    ['Jane', ['iheartjane.com', 'jane.co']],
    ['Treez', ['treez.io']],
    ['Weedmaps', ['weedmaps.com']],
    ['Leafly', ['leafly.com']],
  ];

  const normalizedUrl = menuUrl.toLowerCase();
  const matched = providerMarkers.find(([, markers]) =>
    markers.some((marker) => normalizedUrl.includes(marker))
  );

  return matched ? matched[0] : 'Custom';
}
// Get single store with full details
// Get all stores (from dispensaries table)
/**
 * GET / — list rows of `dispensaries`, ordered by name.
 *
 * Optional query-string filters, combined with AND:
 *   - `city`      compared with ILIKE
 *   - `state`     compared with `=`
 *   - `menu_type` compared with `=`
 *
 * Each returned row is extended with `provider` (from `menu_url`) and the
 * freshness fields derived from `last_crawl_at`. Responds with
 * `{ stores: [...] }`, or HTTP 500 with `{ error: 'Failed to fetch stores' }`
 * on failure.
 */
router.get('/', async (req, res) => {
  try {
    const selectedColumns = [
      'id',
      'name',
      'slug',
      'city',
      'state',
      'address',
      'zip',
      'phone',
      'website',
      'latitude',
      'longitude',
      'menu_url',
      'menu_type',
      'platform',
      'platform_dispensary_id',
      'product_count',
      'last_crawl_at',
      'created_at',
      'updated_at',
    ];

    // Predicate prefix paired with the request value that activates it.
    const candidateFilters: Array<[string, unknown]> = [
      ['city ILIKE', req.query.city],
      ['state =', req.query.state],
      ['menu_type =', req.query.menu_type],
    ];

    const filterValues: unknown[] = [];
    const whereClauses: string[] = [];
    for (const [predicate, value] of candidateFilters) {
      if (!value) continue;
      // Each supplied filter takes the next positional placeholder ($1, $2, ...).
      whereClauses.push(`${predicate} $${filterValues.length + 1}`);
      filterValues.push(value);
    }

    let sql = `\nSELECT\n${selectedColumns.join(',\n')}\nFROM dispensaries\n`;
    if (whereClauses.length > 0) {
      sql += ` WHERE ${whereClauses.join(' AND ')}`;
    }
    sql += ` ORDER BY name`;

    const { rows } = await pool.query(sql, filterValues);

    // Add computed fields
    const stores = rows.map((row) => {
      const provider = detectProvider(row.menu_url);
      const freshness = calculateFreshness(row.last_crawl_at);
      return { ...row, provider, ...freshness };
    });

    res.json({ stores });
  } catch (err) {
    console.error('Error fetching stores:', err);
    res.status(500).json({ error: 'Failed to fetch stores' });
  }
});
// Get single store by ID (from dispensaries table)
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
// Get store with counts and linked dispensary
const result = await pool.query(`
SELECT
s.*,
d.id as dispensary_id,
d.name as dispensary_name,
d.slug as dispensary_slug,
d.state as dispensary_state,
d.city as dispensary_city,
d.address as dispensary_address,
d.menu_provider as dispensary_menu_provider,
COUNT(DISTINCT p.id) as product_count,
COUNT(DISTINCT c.id) as category_count,
COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = true) as in_stock_count,
COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = false) as out_of_stock_count
FROM stores s
LEFT JOIN dispensaries d ON s.dispensary_id = d.id
LEFT JOIN products p ON s.id = p.store_id
LEFT JOIN categories c ON s.id = c.store_id
WHERE s.id = $1
GROUP BY s.id, d.id, d.name, d.slug, d.state, d.city, d.address, d.menu_provider
id,
name,
slug,
city,
state,
address,
zip,
phone,
website,
dba_name,
company_name,
latitude,
longitude,
menu_url,
menu_type,
platform,
platform_dispensary_id,
product_count,
last_crawl_at,
raw_metadata,
created_at,
updated_at
FROM dispensaries
WHERE id = $1
`, [id]);
if (result.rows.length === 0) {
@@ -115,62 +175,19 @@ router.get('/:id', async (req, res) => {
const store = result.rows[0];
// Get recent crawl jobs for this store
const jobsResult = await pool.query(`
SELECT
id, status, job_type, trigger_type,
started_at, completed_at,
products_found, products_new, products_updated,
in_stock_count, out_of_stock_count,
error_message
FROM crawl_jobs
WHERE store_id = $1
ORDER BY created_at DESC
LIMIT 10
`, [id]);
// Get schedule info if exists
const scheduleResult = await pool.query(`
SELECT
enabled, interval_hours, next_run_at, last_run_at
FROM store_crawl_schedule
WHERE store_id = $1
`, [id]);
// Calculate freshness
const freshness = calculateFreshness(store.last_scraped_at);
const freshness = calculateFreshness(store.last_crawl_at);
// Detect provider from URL
const provider = detectProvider(store.dutchie_url);
const provider = detectProvider(store.menu_url);
// Build response
const response = {
...store,
provider,
freshness: freshness.freshness,
is_stale: freshness.is_stale,
hours_since_scrape: freshness.hours_since_scrape,
linked_dispensary: store.dispensary_id ? {
id: store.dispensary_id,
name: store.dispensary_name,
slug: store.dispensary_slug,
state: store.dispensary_state,
city: store.dispensary_city,
address: store.dispensary_address,
menu_provider: store.dispensary_menu_provider
} : null,
schedule: scheduleResult.rows[0] || null,
recent_jobs: jobsResult.rows
...freshness,
};
// Remove redundant dispensary fields from root
delete response.dispensary_name;
delete response.dispensary_slug;
delete response.dispensary_state;
delete response.dispensary_city;
delete response.dispensary_address;
delete response.dispensary_menu_provider;
res.json(response);
} catch (error) {
console.error('Error fetching store:', error);
@@ -178,93 +195,106 @@ router.get('/:id', async (req, res) => {
}
});
// Get store brands
/**
 * GET /:id/brands — list the brand names recorded for one store.
 *
 * Reads `name` from `brands` where `store_id` equals the `:id` path
 * parameter, ordered by name. Responds with `{ brands: [...] }`, or
 * HTTP 500 with `{ error: 'Failed to fetch store brands' }` on failure.
 */
router.get('/:id/brands', async (req, res) => {
  try {
    const storeId = req.params.id;
    const brandsSql = [
      '',
      'SELECT name',
      'FROM brands',
      'WHERE store_id = $1',
      'ORDER BY name',
      '',
    ].join('\n');

    const { rows } = await pool.query(brandsSql, [storeId]);

    // Flatten the result rows to a plain list of names.
    const brands: unknown[] = [];
    for (const row of rows) {
      brands.push(row.name);
    }

    res.json({ brands });
  } catch (err) {
    console.error('Error fetching store brands:', err);
    res.status(500).json({ error: 'Failed to fetch store brands' });
  }
});
// Get store specials
/**
 * GET /:id/specials — list a store's specials for a single date.
 *
 * The date comes from the `date` query parameter; when absent, the date
 * portion of the current `toISOString()` timestamp is used. Each special
 * is joined to its product to expose `product_name` and `product_image`.
 * Responds with `{ specials: [...], date }`, or HTTP 500 with
 * `{ error: 'Failed to fetch store specials' }` on failure.
 */
router.get('/:id/specials', async (req, res) => {
  try {
    const storeId = req.params.id;
    const requestedDate = req.query.date;

    // Use provided date or today's date
    const queryDate = requestedDate
      ? requestedDate
      : new Date().toISOString().split('T')[0];

    const specialsSql = [
      '',
      'SELECT',
      's.*,',
      'p.name as product_name,',
      'p.image_url as product_image',
      'FROM specials s',
      'LEFT JOIN products p ON s.product_id = p.id',
      'WHERE s.store_id = $1 AND s.valid_date = $2',
      'ORDER BY s.name',
      '',
    ].join('\n');

    const { rows } = await pool.query(specialsSql, [storeId, queryDate]);
    res.json({ specials: rows, date: queryDate });
  } catch (err) {
    console.error('Error fetching store specials:', err);
    res.status(500).json({ error: 'Failed to fetch store specials' });
  }
});
// Create store
// Create store (into dispensaries table)
router.post('/', requireRole('superadmin', 'admin'), async (req, res) => {
try {
const { name, slug, dutchie_url, active, scrape_enabled } = req.body;
const {
name,
slug,
city,
state,
address,
zip,
phone,
website,
menu_url,
menu_type,
platform,
platform_dispensary_id,
latitude,
longitude
} = req.body;
if (!name || !slug || !city || !state) {
return res.status(400).json({ error: 'name, slug, city, and state are required' });
}
const result = await pool.query(`
INSERT INTO stores (name, slug, dutchie_url, active, scrape_enabled)
VALUES ($1, $2, $3, $4, $5)
INSERT INTO dispensaries (
name, slug, city, state, address, zip, phone, website,
menu_url, menu_type, platform, platform_dispensary_id,
latitude, longitude, created_at, updated_at
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
RETURNING *
`, [name, slug, dutchie_url, active ?? true, scrape_enabled ?? true]);
`, [
name, slug, city, state, address, zip, phone, website,
menu_url, menu_type, platform || 'dutchie', platform_dispensary_id,
latitude, longitude
]);
res.status(201).json(result.rows[0]);
} catch (error) {
} catch (error: any) {
console.error('Error creating store:', error);
res.status(500).json({ error: 'Failed to create store' });
if (error.code === '23505') { // unique violation
res.status(409).json({ error: 'Store with this slug already exists' });
} else {
res.status(500).json({ error: 'Failed to create store' });
}
}
});
// Update store
// Update store (in dispensaries table)
router.put('/:id', requireRole('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { name, slug, dutchie_url, active, scrape_enabled } = req.body;
const {
name,
slug,
city,
state,
address,
zip,
phone,
website,
menu_url,
menu_type,
platform,
platform_dispensary_id,
latitude,
longitude
} = req.body;
const result = await pool.query(`
UPDATE stores
SET name = COALESCE($1, name),
slug = COALESCE($2, slug),
dutchie_url = COALESCE($3, dutchie_url),
active = COALESCE($4, active),
scrape_enabled = COALESCE($5, scrape_enabled),
updated_at = CURRENT_TIMESTAMP
WHERE id = $6
UPDATE dispensaries
SET
name = COALESCE($1, name),
slug = COALESCE($2, slug),
city = COALESCE($3, city),
state = COALESCE($4, state),
address = COALESCE($5, address),
zip = COALESCE($6, zip),
phone = COALESCE($7, phone),
website = COALESCE($8, website),
menu_url = COALESCE($9, menu_url),
menu_type = COALESCE($10, menu_type),
platform = COALESCE($11, platform),
platform_dispensary_id = COALESCE($12, platform_dispensary_id),
latitude = COALESCE($13, latitude),
longitude = COALESCE($14, longitude),
updated_at = CURRENT_TIMESTAMP
WHERE id = $15
RETURNING *
`, [name, slug, dutchie_url, active, scrape_enabled, id]);
`, [
name, slug, city, state, address, zip, phone, website,
menu_url, menu_type, platform, platform_dispensary_id,
latitude, longitude, id
]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
res.json(result.rows[0]);
} catch (error) {
console.error('Error updating store:', error);
@@ -272,17 +302,17 @@ router.put('/:id', requireRole('superadmin', 'admin'), async (req, res) => {
}
});
// Delete store
// Delete store (from dispensaries table)
router.delete('/:id', requireRole('superadmin'), async (req, res) => {
try {
const { id } = req.params;
const result = await pool.query('DELETE FROM stores WHERE id = $1 RETURNING *', [id]);
const result = await pool.query('DELETE FROM dispensaries WHERE id = $1 RETURNING *', [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
res.json({ message: 'Store deleted successfully' });
} catch (error) {
console.error('Error deleting store:', error);
@@ -290,135 +320,55 @@ router.delete('/:id', requireRole('superadmin'), async (req, res) => {
}
});
// Trigger scrape for a store
router.post('/:id/scrape', requireRole('superadmin', 'admin'), async (req, res) => {
// Get products for a store (uses dutchie_products table)
router.get('/:id/products', async (req, res) => {
try {
const { id } = req.params;
const { parallel = 3, userAgent } = req.body; // Default to 3 parallel scrapers
const storeResult = await pool.query('SELECT id FROM stores WHERE id = $1', [id]);
if (storeResult.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
scrapeStore(parseInt(id), parseInt(parallel), userAgent).catch(err => {
console.error('Background scrape error:', err);
});
res.json({
message: 'Scrape started',
parallel: parseInt(parallel),
userAgent: userAgent || 'random'
});
} catch (error) {
console.error('Error triggering scrape:', error);
res.status(500).json({ error: 'Failed to trigger scrape' });
}
});
// Download missing images for a store
router.post('/:id/download-images', requireRole('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const storeResult = await pool.query('SELECT id, name FROM stores WHERE id = $1', [id]);
if (storeResult.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
const store = storeResult.rows[0];
const productsResult = await pool.query(`
SELECT id, name, image_url
FROM products
WHERE store_id = $1
AND image_url IS NOT NULL
AND local_image_path IS NULL
const result = await pool.query(`
SELECT
id,
name,
brand_name,
type,
subcategory,
stock_status,
thc_content,
cbd_content,
primary_image_url,
external_product_id,
created_at,
updated_at
FROM dutchie_products
WHERE dispensary_id = $1
ORDER BY name
`, [id]);
(async () => {
const { uploadImageFromUrl } = await import('../utils/minio');
let downloaded = 0;
for (const product of productsResult.rows) {
try {
console.log(`📸 Downloading image for: ${product.name}`);
const localPath = await uploadImageFromUrl(product.image_url, product.id);
await pool.query(`
UPDATE products
SET local_image_path = $1
WHERE id = $2
`, [localPath, product.id]);
downloaded++;
} catch (error) {
console.error(`Failed to download image for ${product.name}:`, error);
}
}
console.log(`✅ Downloaded ${downloaded} of ${productsResult.rows.length} missing images for ${store.name}`);
})().catch(err => console.error('Background image download error:', err));
res.json({
message: 'Image download started',
total_missing: productsResult.rows.length
});
res.json({ products: result.rows });
} catch (error) {
console.error('Error triggering image download:', error);
res.status(500).json({ error: 'Failed to trigger image download' });
console.error('Error fetching store products:', error);
res.status(500).json({ error: 'Failed to fetch products' });
}
});
// Discover categories for a store
router.post('/:id/discover-categories', requireRole('superadmin', 'admin'), async (req, res) => {
// Get brands for a store
router.get('/:id/brands', async (req, res) => {
try {
const { id } = req.params;
const storeResult = await pool.query('SELECT id FROM stores WHERE id = $1', [id]);
if (storeResult.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
discoverCategories(parseInt(id)).catch(err => {
console.error('Background category discovery error:', err);
});
res.json({ message: 'Category discovery started' });
} catch (error) {
console.error('Error triggering category discovery:', error);
res.status(500).json({ error: 'Failed to trigger category discovery' });
}
});
// Debug scraper
router.post('/:id/debug-scrape', requireRole('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
console.log('Debug scrape triggered for store:', id);
const categoryResult = await pool.query(`
SELECT c.dutchie_url, c.name
FROM categories c
WHERE c.store_id = $1 AND c.slug = 'edibles'
LIMIT 1
const result = await pool.query(`
SELECT DISTINCT brand_name as name, COUNT(*) as product_count
FROM dutchie_products
WHERE dispensary_id = $1 AND brand_name IS NOT NULL
GROUP BY brand_name
ORDER BY product_count DESC, brand_name
`, [id]);
if (categoryResult.rows.length === 0) {
return res.status(404).json({ error: 'Edibles category not found' });
}
console.log('Found category:', categoryResult.rows[0]);
const { debugDutchiePage } = await import('../services/scraper-debug');
debugDutchiePage(categoryResult.rows[0].dutchie_url).catch(err => {
console.error('Debug error:', err);
});
res.json({ message: 'Debug started, check logs', url: categoryResult.rows[0].dutchie_url });
const brands = result.rows.map((row: any) => row.name);
res.json({ brands, details: result.rows });
} catch (error) {
console.error('Debug endpoint error:', error);
res.status(500).json({ error: 'Failed to debug' });
console.error('Error fetching store brands:', error);
res.status(500).json({ error: 'Failed to fetch store brands' });
}
});