Files
cannaiq/backend/dist/scripts/platform-id-extractor.js
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

302 lines
11 KiB
JavaScript

"use strict";
/**
* Platform ID Extractor - Standalone script for extracting Dutchie platform IDs
*
* This script visits dispensary websites to capture their Dutchie retailerId
* by intercepting network requests to the Dutchie GraphQL API.
*
* It does NOT use the main orchestrator - it's a standalone browser-based tool.
*/
Object.defineProperty(exports, "__esModule", { value: true });
const playwright_1 = require("playwright");
const pg_1 = require("pg");
// Shared PostgreSQL connection pool for this script, configured from the
// DATABASE_URL environment variable. main() is responsible for ending it.
const pool = new pg_1.Pool({
connectionString: process.env.DATABASE_URL
});
/**
 * Patterns that pull a 24-hex-character Dutchie retailer/dispensary ID out of
 * text (URLs, request bodies, HTML, inline scripts). They are tried in order
 * and the first match wins, so the most specific patterns come first.
 *
 * NOTE(review): the last pattern matches ANY 24-hex path segment. It is safe
 * for URLs already known to be Dutchie requests, but on raw page HTML it can
 * match unrelated values such as asset hashes - confirm this is acceptable.
 */
const PLATFORM_ID_PATTERNS = [
    /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i,
    /["']dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i,
    /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i,
    /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i,
    /dutchie\.com\/dispensary\/([a-f0-9]{24})/i,
    /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i,
    /retailerId=([a-f0-9]{24})/i,
    /\/([a-f0-9]{24})(?:\/|\?|$)/i, // Generic ID in URL path
];

/** Returns the first platform ID any known pattern finds in `text`, or null. */
function matchPlatformId(text) {
    if (!text) {
        return null;
    }
    for (const pattern of PLATFORM_ID_PATTERNS) {
        const match = text.match(pattern);
        if (match && match[1]) {
            return match[1];
        }
    }
    return null;
}

/** Returns the first 24-hex ID found in the src of a Dutchie iframe on the current page, or null. */
async function findIdInDutchieIframes(page) {
    const sources = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
    });
    for (const src of sources) {
        if (!src.includes('dutchie')) {
            continue;
        }
        const match = src.match(/([a-f0-9]{24})/i);
        if (match) {
            return match[1];
        }
    }
    return null;
}

/** Returns the href of the first link whose URL or text mentions menu/shop/order, or null. */
async function findMenuLink(page) {
    return page.evaluate(() => {
        const links = Array.from(document.querySelectorAll('a'));
        for (const link of links) {
            const href = link.href?.toLowerCase() || '';
            const text = link.textContent?.toLowerCase() || '';
            if (href.includes('menu') || href.includes('shop') || href.includes('order') ||
                text.includes('menu') || text.includes('shop') || text.includes('order')) {
                return link.href;
            }
        }
        return null;
    });
}

/** Turns any thrown value (Error or not) into a log-friendly message of at most 100 characters. */
function describeError(err) {
    return String(err?.message ?? err).substring(0, 100);
}

/**
 * Visits a dispensary website and tries to discover its Dutchie platform ID.
 *
 * Sources are tried in this order: intercepted Dutchie network requests (URL,
 * then POST body), page HTML, the __NEXT_DATA__ element, Dutchie iframes,
 * script tags, the first menu/shop/order link's page, and finally a delayed
 * iframe re-check. The first source that yields an ID wins; later checks
 * never overwrite it.
 *
 * @param browser Playwright browser used to open a fresh context for this site.
 * @param dispensary Row with `id`, `name` and `website`.
 * @returns {Promise<{id, name, website, platformId: (string|null), source: (string|null), error: (string|null)}>}
 *   `platformId`/`source` are null when nothing was found; `error` holds a
 *   truncated message when page setup, navigation or inspection failed.
 */
async function extractPlatformId(browser, dispensary) {
    let capturedId = null;
    let captureSource = null;
    let errorMsg = null;
    // First capture wins. The request listener can fire during any await
    // below, so every source goes through this guard to keep a page-derived
    // match from clobbering an ID already taken from Dutchie network traffic.
    const record = (id, source) => {
        if (id && !capturedId) {
            capturedId = id;
            captureSource = source;
        }
    };
    const context = await browser.newContext({
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    });
    try {
        // Created inside the try so the context is still closed if this fails.
        const page = await context.newPage();
        // Intercept network requests
        page.on('request', (request) => {
            if (capturedId) {
                return;
            }
            const url = request.url();
            if (!url.includes('dutchie')) {
                return;
            }
            record(matchPlatformId(url), 'request_url');
            record(matchPlatformId(request.postData()), 'request_body');
        });
        console.log(`\n[${dispensary.id}] ${dispensary.name}: ${dispensary.website}`);
        // Load main page
        await page.goto(dispensary.website, {
            waitUntil: 'domcontentloaded',
            timeout: 25000
        });
        await page.waitForTimeout(2000);
        // Check page content
        if (!capturedId) {
            record(matchPlatformId(await page.content()), 'page_content');
        }
        // Check __NEXT_DATA__
        if (!capturedId) {
            const nextData = await page.evaluate(() => {
                const el = document.getElementById('__NEXT_DATA__');
                return el?.textContent || null;
            });
            record(matchPlatformId(nextData), '__NEXT_DATA__');
        }
        // Check iframes
        if (!capturedId) {
            record(await findIdInDutchieIframes(page), 'iframe_src');
        }
        // Check scripts (external src URL, or the first 1000 chars of inline code)
        if (!capturedId) {
            const scripts = await page.evaluate(() => {
                return Array.from(document.querySelectorAll('script'))
                    .map(s => s.src || s.innerHTML?.substring(0, 1000))
                    .filter(Boolean);
            });
            for (const script of scripts) {
                if (!script.includes('dutchie') && !script.includes('retailerId')) {
                    continue;
                }
                const id = matchPlatformId(script);
                if (id) {
                    record(id, 'script');
                    break;
                }
            }
        }
        // Try navigating to menu/shop page
        if (!capturedId) {
            const menuLink = await findMenuLink(page);
            if (menuLink && !menuLink.startsWith('javascript:')) {
                try {
                    console.log(` -> Following menu link: ${menuLink.substring(0, 60)}...`);
                    await page.goto(menuLink, { waitUntil: 'domcontentloaded', timeout: 20000 });
                    await page.waitForTimeout(3000);
                    // Only fall back to page scraping if the request listener
                    // did not capture an ID while the menu page was loading.
                    if (!capturedId) {
                        record(matchPlatformId(await page.content()), 'menu_page_content');
                    }
                    if (!capturedId) {
                        record(await findIdInDutchieIframes(page), 'menu_page_iframe');
                    }
                }
                catch (navError) {
                    // Best effort: report the failure and continue with the remaining checks.
                    console.log(` -> Menu navigation failed: ${describeError(navError)}`);
                }
            }
        }
        // Final wait for async content
        if (!capturedId) {
            await page.waitForTimeout(3000);
            record(await findIdInDutchieIframes(page), 'delayed_iframe');
        }
        if (capturedId) {
            console.log(` ✓ Found: ${capturedId} (${captureSource})`);
        }
        else {
            console.log(` ✗ Not found`);
        }
    }
    catch (e) {
        errorMsg = describeError(e);
        console.log(` ✗ Error: ${errorMsg}`);
    }
    finally {
        await context.close();
    }
    return {
        id: dispensary.id,
        name: dispensary.name,
        website: dispensary.website,
        platformId: capturedId,
        source: captureSource,
        error: errorMsg
    };
}
/**
 * CLI entry point.
 *
 * Usage: `node platform-id-extractor.js [dispensaryId]`
 *
 * With a dispensary ID, only that row is processed (if it has a website).
 * Without one, every AZ dispensary with menu_type 'dutchie', a website and no
 * platform_dispensary_id is processed, ordered by name. Each ID found is
 * written to the database immediately, and a summary is printed at the end.
 *
 * @throws {Error} If the optional ID argument is not a positive integer.
 *   Previously such an argument was silently treated as "process everything".
 */
async function main() {
    // Get specific dispensary ID from command line, or process all missing
    const rawTargetId = process.argv[2];
    let targetId = null;
    if (rawTargetId !== undefined) {
        targetId = Number.parseInt(rawTargetId, 10);
        if (!Number.isInteger(targetId) || targetId <= 0) {
            throw new Error(`Invalid dispensary ID argument: "${rawTargetId}" (expected a positive integer)`);
        }
    }
    let query;
    let params = [];
    if (targetId !== null) {
        query = `
            SELECT id, name, website
            FROM dispensaries
            WHERE id = $1
            AND website IS NOT NULL AND website != ''
        `;
        params = [targetId];
    }
    else {
        query = `
            SELECT id, name, website
            FROM dispensaries
            WHERE state = 'AZ'
            AND menu_type = 'dutchie'
            AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '')
            AND website IS NOT NULL AND website != ''
            ORDER BY name
        `;
    }
    try {
        const result = await pool.query(query, params);
        if (result.rows.length === 0) {
            console.log('No dispensaries to process');
            return;
        }
        console.log(`\n=== Platform ID Extractor ===`);
        console.log(`Processing ${result.rows.length} dispensaries...\n`);
        const browser = await playwright_1.chromium.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox']
        });
        const results = [];
        try {
            // Sites are processed one at a time, sharing a single browser.
            for (const dispensary of result.rows) {
                const extractionResult = await extractPlatformId(browser, dispensary);
                results.push(extractionResult);
                // Update database immediately if found
                if (extractionResult.platformId) {
                    await pool.query('UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2', [extractionResult.platformId, extractionResult.id]);
                }
            }
        }
        finally {
            // Close the browser even when extraction or the UPDATE throws.
            await browser.close();
        }
        // Summary
        console.log('\n' + '='.repeat(60));
        console.log('SUMMARY');
        console.log('='.repeat(60));
        const found = results.filter(r => r.platformId);
        const notFound = results.filter(r => !r.platformId);
        console.log(`\nFound: ${found.length}/${results.length}`);
        if (found.length > 0) {
            console.log('\nSuccessful extractions:');
            found.forEach(r => console.log(` [${r.id}] ${r.name} -> ${r.platformId} (${r.source})`));
        }
        if (notFound.length > 0) {
            console.log(`\nNot found: ${notFound.length}`);
            notFound.forEach(r => {
                const reason = r.error || 'No Dutchie ID detected';
                console.log(` [${r.id}] ${r.name}: ${reason}`);
            });
        }
    }
    finally {
        // Release database connections on every exit path.
        await pool.end();
    }
}
// Run the script. Any rejection escaping main() is logged and the process
// exits with status 1.
main().catch((fatalError) => {
    console.error('Fatal error:', fatalError);
    process.exit(1);
});