feat: AZ dispensary harmonization with Dutchie source of truth

Major changes:
- Add harmonize-az-dispensaries.ts script to sync dispensaries with Dutchie API
- Add migration 057 for crawl_enabled and dutchie_verified fields
- Remove legacy dutchie-az module (replaced by platforms/dutchie)
- Clean up deprecated crawlers, scrapers, and orchestrator code
- Update location-discovery to not fall back to slug when ID is missing
- Add crawl-rotator service for proxy rotation
- Add types/index.ts for shared type definitions
- Add woodpecker-agent k8s manifest

Harmonization script:
- Queries ConsumerDispensaries API for all 32 AZ cities
- Matches dispensaries by platform_dispensary_id (not slug)
- Updates existing records with full Dutchie data
- Creates new records for unmatched Dutchie dispensaries
- Disables dispensaries not found in Dutchie

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-08 10:19:49 -07:00
parent 948a732dd5
commit b7cfec0770
112 changed files with 3163 additions and 34694 deletions

View File

@@ -0,0 +1,114 @@
/**
* Debug Dutchie city page to see what data is available
*/
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
// Register the stealth plugin on the shared puppeteer-extra instance at module load,
// i.e. before main() launches a browser.
puppeteer.use(StealthPlugin());
/**
 * Opens a Dutchie city page in a headless browser and prints what data it exposes.
 *
 * The URL is taken from the first CLI argument (`process.argv[2]`), defaulting to the
 * Bellevue, WA city page. The function logs, in order:
 *  - the page title,
 *  - whether the page looks like a Cloudflare challenge (and re-checks after waiting),
 *  - the structure of the `__NEXT_DATA__` payload when it is present and parses as JSON,
 *  - otherwise fallback diagnostics: `<script id>` tags, well-known window globals, and
 *    the first 500 characters of visible body text.
 *
 * The browser is always closed, even when navigation or evaluation throws.
 *
 * @returns Resolves once the diagnostics have been printed and the browser is closed.
 * @throws Rejects with whatever Puppeteer throws (launch, navigation timeout, evaluate).
 */
async function main() {
  const cityUrl = process.argv[2] || 'https://dutchie.com/us/dispensaries/wa-bellevue';
  console.log(`Debugging page: ${cityUrl}`);

  // Single place for the fixed delays used below.
  const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    console.log('Navigating...');
    await page.goto(cityUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    // Extra settle time after network idle for client-side rendering.
    await sleep(5000);

    // Get page title
    const title = await page.title();
    console.log(`\nPage title: ${title}`);

    // Check for Cloudflare challenge. `document.body` is guarded because it can be
    // null on a document that has not produced a <body> yet.
    const detectChallenge = (): Promise<boolean> =>
      page.evaluate(() => {
        const bodyText = document.body?.textContent ?? '';
        return document.title.includes('Just a moment') || bodyText.includes('Enable JavaScript');
      });

    if (await detectChallenge()) {
      console.log('\n⚠ CLOUDFLARE CHALLENGE DETECTED - waiting longer...');
      await sleep(10000);
      // Re-check so the output below is not silently describing the challenge page.
      if (await detectChallenge()) {
        console.log('⚠ Cloudflare challenge still present after waiting - output below may describe the challenge page');
      } else {
        console.log(`Page title after challenge: ${await page.title()}`);
      }
    }

    // Check for __NEXT_DATA__. The result is tagged so that a script tag whose content
    // is not a JSON object is reported as a failure rather than as found data.
    const nextDataResult = await page.evaluate(() => {
      const script = document.querySelector('script#__NEXT_DATA__');
      if (!script) {
        return { status: 'missing' as const };
      }
      try {
        const data = JSON.parse(script.textContent || '{}');
        if (data === null || typeof data !== 'object') {
          return { status: 'invalid' as const };
        }
        return { status: 'ok' as const, data };
      } catch {
        return { status: 'invalid' as const };
      }
    });

    if (nextDataResult.status === 'ok') {
      const nextData = nextDataResult.data;
      console.log('\n✅ __NEXT_DATA__ found!');
      console.log('Keys:', Object.keys(nextData));

      const pageProps = nextData.props?.pageProps;
      if (pageProps) {
        console.log('pageProps keys:', Object.keys(pageProps));
        const dispensaries = pageProps.dispensaries;
        if (dispensaries) {
          console.log('Dispensaries count:', dispensaries.length);
          // Show first dispensary structure
          const first = dispensaries[0];
          if (first) {
            console.log('\nFirst dispensary keys:', Object.keys(first));
            // Truncated to 1000 characters to keep the console output readable.
            console.log('First dispensary sample:', JSON.stringify(first, null, 2).slice(0, 1000));
          }
        }
      }
      return; // the finally block below still closes the browser
    }

    if (nextDataResult.status === 'invalid') {
      console.log('\n❌ __NEXT_DATA__ script found but its content is not a JSON object');
    } else {
      console.log('\n❌ No __NEXT_DATA__ found');
    }

    // Check what scripts are on the page
    const scripts = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('script[id]')).map((s) => ({
        id: s.id,
        src: (s as HTMLScriptElement).src?.slice(0, 100),
      }));
    });
    console.log('Scripts with IDs:', scripts);

    // Try to find dispensary data in window object. For each global that is set, report
    // its keys (objects) or its typeof (everything else).
    const windowData = await page.evaluate(() => {
      const w = window as any;
      const keys = ['__NEXT_DATA__', '__PRELOADED_STATE__', '__INITIAL_STATE__',
        'dispensaries', '__data', 'pageData', '__remixContext'];
      const found: Record<string, string[] | string> = {};
      for (const key of keys) {
        if (w[key]) {
          found[key] = typeof w[key] === 'object' ? Object.keys(w[key]) : typeof w[key];
        }
      }
      return found;
    });
    console.log('Window data:', windowData);

    // Get some page content
    const bodyText = await page.evaluate(() => document.body?.innerText.slice(0, 500) ?? '');
    console.log('\nPage text preview:', bodyText);
  } finally {
    await browser.close();
  }
}
// Log any failure and mark the run as failed so shells/CI see a non-zero exit status;
// with console.error alone the process would still exit with status 0.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});