Major changes:
- Add harmonize-az-dispensaries.ts script to sync dispensaries with Dutchie API
- Add migration 057 for crawl_enabled and dutchie_verified fields
- Remove legacy dutchie-az module (replaced by platforms/dutchie)
- Clean up deprecated crawlers, scrapers, and orchestrator code
- Update location-discovery to not fall back to slug when ID is missing
- Add crawl-rotator service for proxy rotation
- Add types/index.ts for shared type definitions
- Add woodpecker-agent k8s manifest

Harmonization script:
- Queries ConsumerDispensaries API for all 32 AZ cities
- Matches dispensaries by platform_dispensary_id (not slug)
- Updates existing records with full Dutchie data
- Creates new records for unmatched Dutchie dispensaries
- Disables dispensaries not found in Dutchie

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
115 lines
3.6 KiB
TypeScript
115 lines
3.6 KiB
TypeScript
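/**
 * Debug a Dutchie city page to see what data is available.
 *
 * The city page URL is read from the first command-line argument; when it is
 * omitted, https://dutchie.com/us/dispensaries/wa-bellevue is used.
 */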
|
||
|
||
import puppeteer from 'puppeteer-extra';
|
||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||
|
||
puppeteer.use(StealthPlugin());
|
||
|
||
/**
 * Loads a Dutchie city page in a headless browser and logs what data the page
 * exposes.
 *
 * The URL comes from the first command-line argument and falls back to the
 * Bellevue, WA city page. After navigation the function logs the page title,
 * warns (and waits longer) when the page looks like a Cloudflare challenge,
 * and then inspects the `script#__NEXT_DATA__` payload. When that payload is
 * missing it logs fallback diagnostics instead: scripts that carry an id,
 * well-known global state variables, and the first 500 characters of the
 * visible body text.
 *
 * The browser is always closed, even when a step throws; errors propagate to
 * the caller.
 */
async function main() {
  // Small local helper so the fixed waits below read as intent, not plumbing.
  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  const cityUrl = process.argv[2] || 'https://dutchie.com/us/dispensaries/wa-bellevue';
  console.log(`Debugging page: ${cityUrl}`);

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    console.log('Navigating...');
    await page.goto(cityUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Extra settle time after the network goes idle.
    await pause(5000);

    const title = await page.title();
    console.log(`\nPage title: ${title}`);

    // NOTE: every page.evaluate callback runs inside the browser page, so it
    // must be self-contained and cannot close over variables from this scope.
    const isCFChallenge = await page.evaluate(() => {
      if (document.title.includes('Just a moment')) {
        return true;
      }
      return document.body.textContent?.includes('Enable JavaScript');
    });

    if (isCFChallenge) {
      console.log('\n⚠️ CLOUDFLARE CHALLENGE DETECTED - waiting longer...');
      await pause(10000);
    }

    // Pull the embedded __NEXT_DATA__ JSON: null when the tag is absent, an
    // `{ error }` marker object when its content is not valid JSON.
    const nextData = await page.evaluate(() => {
      const tag = document.querySelector('script#__NEXT_DATA__');
      if (!tag) {
        return null;
      }
      try {
        return JSON.parse(tag.textContent || '{}');
      } catch {
        return { error: 'Failed to parse __NEXT_DATA__' };
      }
    });

    if (!nextData) {
      console.log('\n❌ No __NEXT_DATA__ found');

      // List every script element that has an id, with a truncated src.
      const scripts = await page.evaluate(() => {
        const tagged = document.querySelectorAll<HTMLScriptElement>('script[id]');
        return Array.from(tagged, (el) => ({
          id: el.id,
          src: el.src?.slice(0, 100),
        }));
      });
      console.log('Scripts with IDs:', scripts);

      // Probe a fixed set of global variable names; objects are summarised by
      // their keys, anything else by its typeof.
      const windowData = await page.evaluate(() => {
        const globals = window as any;
        const candidates = ['__NEXT_DATA__', '__PRELOADED_STATE__', '__INITIAL_STATE__',
          'dispensaries', '__data', 'pageData', '__remixContext'];
        const summary: Record<string, any> = {};
        candidates.forEach((name) => {
          const value = globals[name];
          if (!value) {
            return;
          }
          summary[name] = typeof value === 'object' ? Object.keys(value) : typeof value;
        });
        return summary;
      });
      console.log('Window data:', windowData);

      const bodyText = await page.evaluate(() => document.body.innerText.slice(0, 500));
      console.log('\nPage text preview:', bodyText);
    } else {
      console.log('\n✅ __NEXT_DATA__ found!');
      console.log('Keys:', Object.keys(nextData));

      const pageProps = nextData.props?.pageProps;
      if (pageProps) {
        console.log('pageProps keys:', Object.keys(pageProps));

        const dispensaries = pageProps.dispensaries;
        if (dispensaries) {
          console.log('Dispensaries count:', dispensaries.length);

          // Show the shape of the first dispensary, capped at 1000 characters.
          const first = dispensaries[0];
          if (first) {
            console.log('\nFirst dispensary keys:', Object.keys(first));
            console.log('First dispensary sample:', JSON.stringify(first, null, 2).slice(0, 1000));
          }
        }
      }
    }
  } finally {
    await browser.close();
  }
}
|
||
|
||
// Script entry point: run the debug session. On failure, log the error and
// mark the process as failed so shells and CI do not see a successful exit.
// `process.exitCode` is used instead of `process.exit()` so pending output is
// flushed before the process ends.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|