feat: AZ dispensary harmonization with Dutchie source of truth
Major changes: - Add harmonize-az-dispensaries.ts script to sync dispensaries with Dutchie API - Add migration 057 for crawl_enabled and dutchie_verified fields - Remove legacy dutchie-az module (replaced by platforms/dutchie) - Clean up deprecated crawlers, scrapers, and orchestrator code - Update location-discovery to not fall back to slug when ID is missing - Add crawl-rotator service for proxy rotation - Add types/index.ts for shared type definitions - Add woodpecker-agent k8s manifest Harmonization script: - Queries ConsumerDispensaries API for all 32 AZ cities - Matches dispensaries by platform_dispensary_id (not slug) - Updates existing records with full Dutchie data - Creates new records for unmatched Dutchie dispensaries - Disables dispensaries not found in Dutchie 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
114
backend/src/scripts/debug-dutchie-page.ts
Normal file
114
backend/src/scripts/debug-dutchie-page.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
/**
|
||||
* Debug Dutchie city page to see what data is available
|
||||
*/
|
||||
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
|
||||
// Register the stealth plugin on the shared puppeteer-extra instance before any
// browser is launched. NOTE(review): presumably this is here to make the
// headless browser less likely to be flagged as a bot — confirm against the
// plugin's documentation.
puppeteer.use(StealthPlugin());
|
||||
|
||||
/**
 * Opens a Dutchie city page in a headless browser and prints a summary of the
 * data it exposes.
 *
 * The page URL is taken from the first command-line argument; when none is
 * given a built-in default city URL is used. The function logs, in order:
 * the page title, whether the page looks like a Cloudflare challenge, and
 * either a summary of the `__NEXT_DATA__` payload or, when that script tag is
 * absent, a set of fallback diagnostics (scripts with ids, well-known window
 * globals, and a preview of the visible text).
 *
 * The browser is always closed, even when a step throws.
 */
async function main() {
  const targetUrl = process.argv[2] || 'https://dutchie.com/us/dispensaries/wa-bellevue';

  // Promise-based delay used to give the page time to settle.
  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  console.log(`Debugging page: ${targetUrl}`);

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    console.log('Navigating...');
    await page.goto(targetUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Extra settle time after the network goes idle.
    await pause(5000);

    // Report the page title.
    const pageTitle = await page.title();
    console.log(`\nPage title: ${pageTitle}`);

    // Look for the tell-tale strings of a Cloudflare interstitial.
    const challenged = await page.evaluate(() => {
      if (document.title.includes('Just a moment')) {
        return true;
      }
      return document.body.textContent?.includes('Enable JavaScript');
    });

    if (challenged) {
      console.log('\n⚠️ CLOUDFLARE CHALLENGE DETECTED - waiting longer...');
      await pause(10000);
    }

    // Pull the embedded __NEXT_DATA__ JSON, if the page has one. A payload that
    // cannot be parsed comes back as an object carrying an `error` message.
    const nextData = await page.evaluate(() => {
      const tag = document.querySelector('script#__NEXT_DATA__');
      if (!tag) {
        return null;
      }
      try {
        return JSON.parse(tag.textContent || '{}');
      } catch {
        return { error: 'Failed to parse __NEXT_DATA__' };
      }
    });

    if (nextData) {
      console.log('\n✅ __NEXT_DATA__ found!');
      console.log('Keys:', Object.keys(nextData));

      const pageProps = nextData.props?.pageProps;
      if (pageProps) {
        console.log('pageProps keys:', Object.keys(pageProps));

        const dispensaryList = pageProps.dispensaries;
        if (dispensaryList) {
          console.log('Dispensaries count:', dispensaryList.length);

          // Print the shape of the first entry, truncated to keep output readable.
          const sample = dispensaryList[0];
          if (sample) {
            console.log('\nFirst dispensary keys:', Object.keys(sample));
            console.log('First dispensary sample:', JSON.stringify(sample, null, 2).slice(0, 1000));
          }
        }
      }
    } else {
      console.log('\n❌ No __NEXT_DATA__ found');

      // List every script element that carries an id.
      const scripts = await page.evaluate(() =>
        Array.from(document.querySelectorAll('script[id]'), (el) => ({
          id: el.id,
          src: (el as HTMLScriptElement).src?.slice(0, 100),
        }))
      );
      console.log('Scripts with IDs:', scripts);

      // Probe a handful of well-known global names for embedded state.
      const windowData = await page.evaluate(() => {
        const globals = window as any;
        const summary: Record<string, any> = {};
        [
          '__NEXT_DATA__',
          '__PRELOADED_STATE__',
          '__INITIAL_STATE__',
          'dispensaries',
          '__data',
          'pageData',
          '__remixContext',
        ].forEach((name) => {
          const value = globals[name];
          if (!value) {
            return;
          }
          // Objects are summarised by their keys, everything else by its type.
          summary[name] = typeof value === 'object' ? Object.keys(value) : typeof value;
        });
        return summary;
      });
      console.log('Window data:', windowData);

      // Show the start of the visible page text.
      const bodyText = await page.evaluate(() => {
        return document.body.innerText.slice(0, 500);
      });
      console.log('\nPage text preview:', bodyText);
    }
  } finally {
    await browser.close();
  }
}
|
||||
|
||||
// Script entry point. A failure is logged and also reflected in the exit
// status, so a calling shell or CI job can tell the run did not succeed.
// Setting process.exitCode (instead of calling process.exit) lets any pending
// console output flush before the process ends.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user