feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
72
backend/src/scripts/bootstrap-stores-for-dispensaries.ts
Normal file
72
backend/src/scripts/bootstrap-stores-for-dispensaries.ts
Normal file
@@ -0,0 +1,72 @@
|
||||
import { Pool } from 'pg';
|
||||
|
||||
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Creates `stores` table records for all dispensaries that:
 *   1. Have menu_type = 'dutchie' AND platform_dispensary_id (ready for GraphQL crawl)
 *   2. Don't already have a linked stores record
 *
 * The stores table is required by the scraper engine (scrapeStore function).
 *
 * Inserts are attempted one dispensary at a time: a failed insert is logged
 * and counted, and the remaining rows are still processed. The shared pool is
 * closed on every exit path, including when the lookup query itself fails.
 *
 * @returns A promise that settles once the summary is printed and the pool is closed.
 * @throws Rethrows any error raised by the initial lookup query.
 */
async function bootstrapStores() {
  try {
    console.log('=== Bootstrapping stores for Dutchie dispensaries ===\n');

    // Find all dutchie dispensaries without linked stores
    const result = await pool.query(`
      SELECT d.id, d.name, d.slug, d.menu_type, d.platform_dispensary_id, d.menu_url
      FROM dispensaries d
      LEFT JOIN stores s ON s.dispensary_id = d.id
      WHERE d.menu_type = 'dutchie'
        AND d.platform_dispensary_id IS NOT NULL
        AND s.id IS NULL
      ORDER BY d.id
    `);

    console.log(`Found ${result.rows.length} dispensaries needing store records\n`);

    let created = 0;
    let errors = 0;

    for (const d of result.rows) {
      try {
        // Insert store record linking to dispensary.
        // The platform_dispensary_id for GraphQL crawling lives in the dispensaries table,
        // so only the link (dispensary_id) plus display fields are written here.
        const insertResult = await pool.query(`
          INSERT INTO stores (
            name,
            slug,
            dispensary_id,
            active,
            scrape_enabled,
            created_at,
            updated_at
          ) VALUES ($1, $2, $3, true, true, NOW(), NOW())
          RETURNING id
        `, [
          d.name,
          // Fall back to a slug derived from the name when the dispensary has none.
          // NOTE(review): the derived slug can start or end with '-' (e.g. "Joe's!" -> "joe-s-").
          d.slug || d.name.toLowerCase().replace(/[^a-z0-9]+/g, '-'),
          d.id
        ]);

        console.log(`[CREATED] Store ${insertResult.rows[0].id} for dispensary ${d.id}: ${d.name}`);
        created++;
      } catch (e: unknown) {
        // Anything can be thrown; only Error instances are guaranteed to carry a message.
        const reason = e instanceof Error ? e.message : String(e);
        console.error(`[ERROR] Dispensary ${d.id} (${d.name}): ${reason}`);
        errors++;
      }
    }

    console.log('\n=== Bootstrap Summary ===');
    console.log(`Created: ${created}`);
    console.log(`Errors: ${errors}`);
    console.log(`Total needing stores: ${result.rows.length}`);
  } finally {
    // Release connections even when the lookup query throws.
    await pool.end();
  }
}
|
||||
|
||||
// Script entry point: run the bootstrap and terminate with exit code 1 if it rejects.
void bootstrapStores().catch((fatal) => {
  console.error('Fatal error:', fatal.message);
  process.exit(1);
});
|
||||
35
backend/src/scripts/check-store-linking.ts
Normal file
35
backend/src/scripts/check-store-linking.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
import { Pool } from 'pg';
|
||||
|
||||
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Prints a diagnostic report on how Dutchie dispensaries (menu_type = 'dutchie'
 * with a platform_dispensary_id) are linked to `stores` rows:
 *   - a sample of up to 15 dispensaries with their linked store (or NONE),
 *   - counts of joined rows with and without a store.
 *
 * The sample is ordered by dispensary id so repeated runs show the same rows.
 * The pool is closed on every exit path.
 *
 * @throws Rethrows any query error after closing the pool.
 */
async function check() {
  try {
    // Check which dispensaries have linked stores
    const result = await pool.query(`
      SELECT d.id as disp_id, d.name, d.menu_type, d.platform_dispensary_id,
             s.id as store_id, s.name as store_name
      FROM dispensaries d
      LEFT JOIN stores s ON s.dispensary_id = d.id
      WHERE d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL
      ORDER BY d.id
      LIMIT 15
    `);

    console.log('Dispensaries with linked stores:');
    for (const r of result.rows) {
      console.log(` [${r.disp_id}] ${r.name} -> store ${r.store_id || 'NONE'} (${r.store_name || 'NOT LINKED'})`);
    }

    // Count how many have linked stores.
    // NOTE(review): counts joined rows, so a dispensary with several stores is counted once per store.
    const countResult = await pool.query(`
      SELECT
        COUNT(*) FILTER (WHERE s.id IS NOT NULL) as with_store,
        COUNT(*) FILTER (WHERE s.id IS NULL) as without_store
      FROM dispensaries d
      LEFT JOIN stores s ON s.dispensary_id = d.id
      WHERE d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL
    `);

    console.log('\nSummary:', countResult.rows[0]);
  } finally {
    await pool.end();
  }
}
|
||||
// Script entry point: report failures on stderr and exit non-zero instead of
// leaving the promise rejection unhandled.
check().catch((err) => {
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
|
||||
130
backend/src/scripts/detect-all.ts
Normal file
130
backend/src/scripts/detect-all.ts
Normal file
@@ -0,0 +1,130 @@
|
||||
import { Pool } from 'pg';
|
||||
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Fetches `url` and resolves with the response body as text.
 *
 * The request is sent with a Googlebot-style User-Agent, follows redirects,
 * and is aborted if the whole exchange - headers AND body - takes longer than
 * `timeout` milliseconds. The HTTP status is not inspected: the body of an
 * error page is returned like any other body.
 *
 * @param url - Absolute URL to fetch.
 * @param timeout - Abort deadline in milliseconds (default 10000).
 * @returns The response body decoded as text.
 * @throws The fetch rejection; on timeout this is an error named 'AbortError'.
 */
async function fetchWithTimeout(url: string, timeout = 10000): Promise<string> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);

  try {
    const resp = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      redirect: 'follow',
    });
    // Keep the timer armed while the body streams in; clearing it as soon as the
    // headers arrive would let a stalled body hang forever.
    return await resp.text();
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Inspects raw HTML for traces of a known menu provider.
 *
 * Checks run in priority order and the first hit wins:
 *   1. `"dispensaryId": "<24 hex>"` (Curaleaf/Sol pattern)  -> dutchie + platformId
 *   2. `embedded-menu/<24 hex>[.js]` (Trulieve pattern)      -> dutchie + platformId
 *   3. a dutchie.com dispensary/embedded-menu/stores link    -> dutchie + menuUrl
 *   4. the host iheartjane.com or jane.co                    -> jane (+ menuUrl if a full URL is present)
 *   5. a *.treez.io host                                     -> treez (+ menuUrl if a full URL is present)
 *   6. a leafly.com/dispensary link                          -> leafly
 *
 * @param html - Page markup to scan.
 * @returns `provider` is one of 'dutchie' | 'jane' | 'treez' | 'leafly' | 'unknown';
 *          `platformId` / `menuUrl` are set only when the matching pattern captured them.
 */
function detectDutchie(html: string): { provider: string; platformId?: string; menuUrl?: string } {
  // Check for reactEnv.dispensaryId (Curaleaf/Sol pattern)
  const reactEnvMatch = html.match(/"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i);
  if (reactEnvMatch) {
    return { provider: 'dutchie', platformId: reactEnvMatch[1] };
  }

  // Check for Dutchie embedded-menu script (Trulieve pattern)
  // Look for: embedded-menu/5eaf48fc972e6200b1303b97.js
  const embedMatch = html.match(/embedded-menu\/([a-f0-9]{24})(?:\.js)?/i);
  if (embedMatch) {
    return { provider: 'dutchie', platformId: embedMatch[1] };
  }

  // Check for dutchie.com links
  const dutchieLink = html.match(/https?:\/\/(?:www\.)?dutchie\.com\/(?:dispensary|embedded-menu|stores)\/([a-zA-Z0-9-]+)/i);
  if (dutchieLink) {
    return { provider: 'dutchie', menuUrl: dutchieLink[0] };
  }

  // Check for jane. The host must stand on its own: a plain substring test for
  // 'jane.co' would also fire on unrelated hosts such as maryjane.com.
  if (/(?:^|[^a-z0-9-])(?:iheartjane\.com|jane\.co)(?![a-z0-9-])/i.test(html)) {
    // Stop at quotes and angle brackets so a single-quoted attribute does not
    // leak its closing `'>` into the captured URL.
    const janeMatch = html.match(/https?:\/\/(?:www\.)?(?:iheartjane\.com|jane\.co)\/[^"'\s<>]+/i);
    return { provider: 'jane', menuUrl: janeMatch?.[0] };
  }

  // Check for treez
  if (html.includes('.treez.io')) {
    const treezMatch = html.match(/https?:\/\/[a-zA-Z0-9-]+\.treez\.io[^"'\s<>]*/i);
    return { provider: 'treez', menuUrl: treezMatch?.[0] };
  }

  // Check for leafly
  if (html.includes('leafly.com/dispensary')) {
    return { provider: 'leafly' };
  }

  return { provider: 'unknown' };
}
|
||||
|
||||
/**
 * Scans up to 150 dispensaries that have a website but no platform_dispensary_id,
 * fetches each site, runs provider detection on the HTML, and writes the
 * detected menu_type (plus platform id or menu URL when available) back to the
 * `dispensaries` table. Sites are processed one at a time; a failure on one
 * site is logged and counted without stopping the run. Prints a summary and
 * closes the pool when done.
 */
async function main() {
  const candidates = await pool.query(`
    SELECT id, name, website
    FROM dispensaries
    WHERE platform_dispensary_id IS NULL
      AND website IS NOT NULL
      AND website NOT LIKE '%example%'
    ORDER BY id
    LIMIT 150
  `);
  const stores = candidates.rows;

  console.log(`Checking ${stores.length} stores...\n`);

  let dutchieHits = 0;
  let otherHits = 0;
  let failures = 0;

  for (const store of stores) {
    // Common "[id] name" prefix shared by every log line for this store.
    const tag = `[${store.id}] ${store.name}`;

    try {
      const detection = detectDutchie(await fetchWithTimeout(store.website));

      if (detection.provider === 'unknown') {
        console.log(`${tag} => no menu found`);
        continue;
      }

      if (detection.provider !== 'dutchie') {
        // Non-Dutchie provider: keep any existing menu_url when none was detected.
        await pool.query(
          'UPDATE dispensaries SET menu_type = $1, menu_url = COALESCE($2, menu_url), updated_at = NOW() WHERE id = $3',
          [detection.provider, detection.menuUrl, store.id]
        );
        console.log(`${tag} => ${detection.provider.toUpperCase()}`);
        otherHits++;
        continue;
      }

      // Dutchie: prefer the platform id, fall back to the menu URL.
      if (detection.platformId) {
        await pool.query(
          'UPDATE dispensaries SET menu_type = $1, platform_dispensary_id = $2, updated_at = NOW() WHERE id = $3',
          ['dutchie', detection.platformId, store.id]
        );
        console.log(`${tag} => DUTCHIE (ID: ${detection.platformId})`);
        dutchieHits++;
      } else if (detection.menuUrl) {
        await pool.query(
          'UPDATE dispensaries SET menu_type = $1, menu_url = $2, updated_at = NOW() WHERE id = $3',
          ['dutchie', detection.menuUrl, store.id]
        );
        console.log(`${tag} => DUTCHIE (URL: ${detection.menuUrl.slice(0, 60)})`);
        dutchieHits++;
      }
    } catch (err: any) {
      // An aborted fetch is reported as a timeout; other messages are truncated to 40 chars.
      let reason: string;
      if (err.name === 'AbortError') {
        reason = 'timeout';
      } else {
        reason = err.message?.slice(0, 40) || 'error';
      }
      console.log(`${tag} => ERROR: ${reason}`);
      failures++;
    }
  }

  console.log('\n=== Summary ===');
  console.log(`Dutchie detected: ${dutchieHits}`);
  console.log(`Other providers: ${otherHits}`);
  console.log(`Errors: ${failures}`);

  await pool.end();
}
|
||||
|
||||
// Script entry point: log a fatal error and make the process exit non-zero so
// shell callers can tell the run failed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
19
backend/src/scripts/export-dispensaries.ts
Normal file
19
backend/src/scripts/export-dispensaries.ts
Normal file
@@ -0,0 +1,19 @@
|
||||
import { Pool } from 'pg';
|
||||
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Writes every dispensary whose menu_type is set to stdout as pretty-printed
 * JSON (2-space indent, ordered by id), then closes the pool.
 */
async function exportDispensaries() {
  const exportSql = `
    SELECT id, name, dba_name, company_name, slug,
           address, city, state, zip, latitude, longitude,
           website, menu_type, menu_url, platform_dispensary_id,
           created_at, updated_at
    FROM dispensaries
    WHERE menu_type IS NOT NULL
    ORDER BY id
  `;

  const dump = await pool.query(exportSql);
  const json = JSON.stringify(dump.rows, null, 2);
  console.log(json);

  await pool.end();
}
|
||||
|
||||
// Script entry point. Errors go to stderr (stdout carries the JSON export) and
// the process exits non-zero instead of leaving the rejection unhandled.
exportDispensaries().catch((err) => {
  console.error('Export failed:', err);
  process.exitCode = 1;
});
|
||||
278
backend/src/scripts/extract-platform-ids.ts
Normal file
278
backend/src/scripts/extract-platform-ids.ts
Normal file
@@ -0,0 +1,278 @@
|
||||
import { chromium } from 'playwright';
|
||||
import { Pool } from 'pg';
|
||||
|
||||
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/** Minimal dispensary record consumed by `extractPlatformId`. */
interface Dispensary {
  /** Row id in the `dispensaries` table. */
  id: number;
  /** Display name, used only for log output. */
  name: string;
  /** URL of the dispensary website that is loaded in the browser. */
  website: string;
}
|
||||
|
||||
/** Patterns whose first capture group is a 24-hex-character Dutchie retailer/dispensary id. */
const PLATFORM_ID_PATTERNS: readonly RegExp[] = [
  /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i,
  /dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i,
  /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i,
  /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i,
  /dutchie\.com\/dispensary\/([a-f0-9]{24})/i,
  /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i,
  /retailerId=([a-f0-9]{24})/i,
];

/** Returns the id captured by the first matching pattern in `text`, or null when none match. */
function findPlatformIdInText(text: string): string | null {
  for (const pattern of PLATFORM_ID_PATTERNS) {
    const id = text.match(pattern)?.[1];
    if (id) return id;
  }
  return null;
}

/** Returns the first 24-hex id found in the `src` of an iframe whose src mentions "dutchie", or null. */
async function findPlatformIdInIframes(page: any): Promise<string | null> {
  const sources: string[] = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
  });

  for (const src of sources) {
    if (!src.includes('dutchie')) continue;
    const id = src.match(/([a-f0-9]{24})/i)?.[1];
    if (id) return id;
  }
  return null;
}

/**
 * Loads a dispensary website in a fresh browser context and tries to discover
 * its 24-hex Dutchie platform id. The first id found wins; later sources never
 * overwrite it. Sources are tried in this order:
 *   - outgoing requests to any URL containing "dutchie" (URL and POST body),
 *   - the rendered page content,
 *   - the `__NEXT_DATA__` element,
 *   - iframe `src` attributes,
 *   - the page reached by clicking a visible internal Shop/Menu/Order link,
 *   - script tags and iframes again after an extra 5 second wait.
 *
 * This is best-effort: any failure while driving the page is logged and the
 * function resolves with whatever id was captured so far (possibly null).
 * The browser context is always closed.
 *
 * @param browser - Object exposing `newContext()`; callers in this file pass a Playwright browser.
 * @param dispensary - Record whose `website` is loaded and whose `name` is logged.
 * @returns The captured id, or null when none was found.
 */
async function extractPlatformId(browser: any, dispensary: Dispensary): Promise<string | null> {
  let capturedId: string | null = null;
  const context = await browser.newContext();

  try {
    // Created inside the try so the context is still closed if this fails.
    const page = await context.newPage();

    // Intercept network requests to find retailer IDs
    page.on('request', (request: any) => {
      const url: string = request.url();
      // 'plus.dutchie' and 'api.dutchie' both contain 'dutchie', so one check covers all three.
      if (!url.includes('dutchie')) return;

      // Check URL for retailer ID
      const fromUrl = url.match(/[\/=]([a-f0-9]{24})(?:[\/\?&]|$)/i)?.[1];
      if (fromUrl && !capturedId) {
        capturedId = fromUrl;
        console.log(` Captured from URL: ${capturedId}`);
      }

      const postData: string | null = request.postData();
      if (!postData) return;

      // Look for retailerId in GraphQL variables
      const retailerId = postData.match(/["']?retailerId["']?\s*:\s*["']([a-f0-9]{24})["']/i)?.[1];
      if (retailerId && !capturedId) {
        capturedId = retailerId;
        console.log(` Captured retailerId: ${capturedId}`);
      }

      // Also look for dispensaryId
      const dispensaryId = postData.match(/["']?dispensaryId["']?\s*:\s*["']([a-f0-9]{24})["']/i)?.[1];
      if (dispensaryId && !capturedId) {
        capturedId = dispensaryId;
        console.log(` Captured dispensaryId: ${capturedId}`);
      }
    });

    console.log(`\nLoading ${dispensary.name}: ${dispensary.website}`);
    await page.goto(dispensary.website, { waitUntil: 'domcontentloaded', timeout: 30000 });

    // Wait for initial load
    await page.waitForTimeout(2000);

    // Check page content for retailerId
    const content: string = await page.content();
    const fromContent = findPlatformIdInText(content);
    if (fromContent && !capturedId) {
      capturedId = fromContent;
      console.log(` Found in content: ${capturedId}`);
    }

    // Check __NEXT_DATA__ if present
    if (!capturedId) {
      const nextData: string | null = await page.evaluate(() => {
        const el = document.getElementById('__NEXT_DATA__');
        return el?.textContent || null;
      });

      const fromNextData = nextData ? findPlatformIdInText(nextData) : null;
      // Re-check: the request listener may have captured an id while we awaited.
      if (fromNextData && !capturedId) {
        capturedId = fromNextData;
        console.log(` Found in __NEXT_DATA__: ${capturedId}`);
      }
    }

    // Look for iframes that might contain dutchie embed
    if (!capturedId) {
      const fromIframe = await findPlatformIdInIframes(page);
      if (fromIframe && !capturedId) {
        capturedId = fromIframe;
        console.log(` Found in iframe: ${capturedId}`);
      }
    }

    // If still not found, try clicking on "Shop" or "Menu" links
    if (!capturedId) {
      const menuSelectors = [
        'a:has-text("Shop")',
        'a:has-text("Menu")',
        'a:has-text("Order")',
        'a[href*="menu"]',
        'a[href*="shop"]',
        'a[href*="order"]',
        'button:has-text("Shop")',
        'button:has-text("Menu")',
      ];

      for (const selector of menuSelectors) {
        try {
          const element = page.locator(selector).first();
          const isVisible = await element.isVisible({ timeout: 500 });
          if (!isVisible) continue;

          const href = await element.getAttribute('href');
          // If it's an internal link, click it
          if (!href || href.startsWith('http')) continue;

          console.log(` Clicking ${selector}...`);
          await element.click();
          await page.waitForTimeout(3000);

          // Check new page content
          const newContent: string = await page.content();
          const afterNav = findPlatformIdInText(newContent);
          if (afterNav && !capturedId) {
            capturedId = afterNav;
            console.log(` Found after navigation: ${capturedId}`);
          }

          // Check iframes on new page
          if (!capturedId) {
            const navIframe = await findPlatformIdInIframes(page);
            if (navIframe && !capturedId) {
              capturedId = navIframe;
              console.log(` Found in iframe after nav: ${capturedId}`);
            }
          }

          if (capturedId) break;
        } catch (e) {
          // Best-effort probing: a selector that fails is skipped, continue to next selector
        }
      }
    }

    // If still not found, wait longer for async dutchie widget to load
    if (!capturedId) {
      console.log(` Waiting for async content...`);
      await page.waitForTimeout(5000);

      // Check for dutchie script tags (src, or the first 500 chars of inline code)
      const scripts: (string | undefined)[] = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('script')).map(s => s.src || s.innerHTML?.substring(0, 500));
      });

      for (const script of scripts) {
        if (capturedId) break;
        if (!script || !script.includes('dutchie')) continue;

        const fromScript = findPlatformIdInText(script);
        if (fromScript) {
          capturedId = fromScript;
          console.log(` Found in script: ${capturedId}`);
        }
      }

      // Final check of iframes after wait
      if (!capturedId) {
        const lateIframe = await findPlatformIdInIframes(page);
        if (lateIframe && !capturedId) {
          capturedId = lateIframe;
          console.log(` Found in iframe (delayed): ${capturedId}`);
        }
      }
    }
  } catch (e: unknown) {
    // Anything can be thrown; only Error instances are guaranteed to carry a message.
    const message = e instanceof Error ? e.message : String(e);
    console.log(` Error: ${message.substring(0, 80)}`);
  } finally {
    await context.close();
  }

  return capturedId;
}
|
||||
|
||||
/**
 * Finds Arizona dispensaries marked as Dutchie that have a website but no
 * platform_dispensary_id, tries to extract the id from each website with a
 * headless Chromium browser, stores every id that is found, and prints a
 * found / not-found summary.
 *
 * The browser and the pool are closed on every exit path, including when a
 * database update fails part-way through the run.
 *
 * @throws Rethrows query, launch or update errors after cleanup.
 */
async function main() {
  try {
    // Get dispensaries missing platform IDs
    const result = await pool.query(`
      SELECT id, name, website
      FROM dispensaries
      WHERE state = 'AZ'
        AND menu_type = 'dutchie'
        AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '')
        AND website IS NOT NULL AND website != ''
      ORDER BY name
    `);

    console.log(`Found ${result.rows.length} dispensaries to process\n`);

    const results: { id: number; name: string; platformId: string | null }[] = [];
    const browser = await chromium.launch({ headless: true });

    try {
      for (const dispensary of result.rows) {
        const platformId = await extractPlatformId(browser, dispensary);
        results.push({ id: dispensary.id, name: dispensary.name, platformId });

        if (platformId) {
          // Update database
          await pool.query(
            'UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2',
            [platformId, dispensary.id]
          );
          console.log(` Updated database with ${platformId}`);
        }
      }
    } finally {
      // Do not leave a headless browser running if an update throws mid-loop.
      await browser.close();
    }

    console.log('\n=== SUMMARY ===');
    const found = results.filter(r => r.platformId);
    const notFound = results.filter(r => !r.platformId);

    console.log(`\nFound (${found.length}):`);
    found.forEach(r => console.log(` ${r.id}: ${r.name} -> ${r.platformId}`));

    console.log(`\nNot Found (${notFound.length}):`);
    notFound.forEach(r => console.log(` ${r.id}: ${r.name}`));
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
// Script entry point: run the extraction and terminate with exit code 1 if it rejects.
void main().catch((failure) => {
  console.error('Error:', failure);
  process.exit(1);
});
|
||||
83
backend/src/scripts/import-dispensaries.ts
Normal file
83
backend/src/scripts/import-dispensaries.ts
Normal file
@@ -0,0 +1,83 @@
|
||||
import { Pool } from 'pg';
|
||||
import * as fs from 'fs';
|
||||
|
||||
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Imports dispensaries from a JSON file into the `dispensaries` table.
 *
 * The file must contain a JSON array of records. Each record is matched to an
 * existing row by exact `name` + `city`:
 *   - match found: the first matching row is updated; a null/missing field in
 *     the record leaves the stored value untouched (COALESCE);
 *   - no match: a new row is inserted.
 * A record that fails is logged and counted without stopping the import.
 * The pool is closed on every exit path.
 *
 * @param filePath - Path of the JSON file to read.
 * @throws If the file cannot be read or parsed, or if its top-level value is not an array.
 */
async function importDispensaries(filePath: string) {
  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    if (!Array.isArray(parsed)) {
      // Fail early with a clear message rather than a "not iterable" TypeError below.
      throw new Error(`Expected a JSON array of dispensaries in ${filePath}`);
    }
    const data = parsed;

    console.log(`Importing ${data.length} dispensaries...`);

    let inserted = 0;
    let updated = 0;
    let errors = 0;

    for (const d of data) {
      try {
        // Check if dispensary exists by name and city
        const { rows: existing } = await pool.query(
          `SELECT id FROM dispensaries WHERE name = $1 AND city = $2`,
          [d.name, d.city]
        );

        if (existing.length > 0) {
          // Update existing
          await pool.query(`
            UPDATE dispensaries SET
              dba_name = COALESCE($1, dba_name),
              company_name = COALESCE($2, company_name),
              slug = COALESCE($3, slug),
              address = COALESCE($4, address),
              state = COALESCE($5, state),
              zip = COALESCE($6, zip),
              latitude = COALESCE($7, latitude),
              longitude = COALESCE($8, longitude),
              website = COALESCE($9, website),
              menu_type = COALESCE($10, menu_type),
              menu_url = COALESCE($11, menu_url),
              platform_dispensary_id = COALESCE($12, platform_dispensary_id),
              updated_at = NOW()
            WHERE id = $13
          `, [
            d.dba_name, d.company_name, d.slug,
            d.address, d.state, d.zip,
            d.latitude, d.longitude, d.website,
            d.menu_type, d.menu_url, d.platform_dispensary_id,
            existing[0].id
          ]);
          console.log(`Updated: [${existing[0].id}] ${d.name} (${d.city})`);
          updated++;
        } else {
          // Insert new
          const { rows: newRow } = await pool.query(`
            INSERT INTO dispensaries (
              name, dba_name, company_name, slug,
              address, city, state, zip, latitude, longitude,
              website, menu_type, menu_url, platform_dispensary_id,
              created_at, updated_at
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW(), NOW())
            RETURNING id
          `, [
            d.name, d.dba_name, d.company_name, d.slug,
            d.address, d.city, d.state, d.zip, d.latitude, d.longitude,
            d.website, d.menu_type, d.menu_url, d.platform_dispensary_id
          ]);
          console.log(`Inserted: [${newRow[0].id}] ${d.name} (${d.city})`);
          inserted++;
        }
      } catch (err: unknown) {
        // `d` may be null or a non-object in a malformed file, hence the optional access.
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`Error for ${d?.name}: ${reason}`);
        errors++;
      }
    }

    console.log(`\n=== Import Summary ===`);
    console.log(`Inserted: ${inserted}`);
    console.log(`Updated: ${updated}`);
    console.log(`Errors: ${errors}`);
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
// Script entry point: the input file is the first CLI argument, with a fixed default path.
const filePath = process.argv[2] || '/tmp/dispensaries-export.json';

// Log a fatal error and make the process exit non-zero so shell callers can tell the import failed.
importDispensaries(filePath).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
133
backend/src/scripts/jars-az-extractor.ts
Normal file
133
backend/src/scripts/jars-az-extractor.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
import { chromium } from 'playwright';
|
||||
|
||||
/**
 * Debugging aid: loads the JARS Mesa (AZ) shop page in headless Chromium and
 * prints everything that could help locate its Dutchie retailer id:
 * page title and size, a full-page screenshot at /tmp/jars-mesa-debug.png,
 * UUIDs found in the markup and in `__NEXT_DATA__`, iframes, "dutchie"
 * mentions, relevant script sources, Dutchie URLs, a network request summary,
 * and any `retailerId` UUIDs seen in request bodies.
 *
 * Nothing is returned or persisted; all findings go to stdout. The browser is
 * always closed.
 */
async function extractJarsAzStoreIds() {
  const browser = await chromium.launch({ headless: true });

  const capturedIds: string[] = [];
  const allRequests: string[] = [];
  // Same UUID shape is searched in the page markup and in __NEXT_DATA__.
  const uuidPattern = /[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}/gi;

  try {
    // Created inside the try so the browser is still closed if this fails.
    const page = await browser.newPage();

    // Intercept network requests to find Dutchie Plus API calls
    page.on('request', (request) => {
      const url = request.url();
      // Only the first 100 characters are kept for the summary.
      allRequests.push(url.substring(0, 100));

      if (url.includes('dutchie') || url.includes('graphql')) {
        const postData = request.postData();
        console.log('Dutchie request to:', url.substring(0, 80));
        if (postData) {
          // Look for retailerId in GraphQL variables
          const match = postData.match(/"retailerId"\s*:\s*"([a-f0-9-]{36})"/i);
          if (match) {
            const id = match[1];
            if (!capturedIds.includes(id)) {
              capturedIds.push(id);
              console.log('Captured retailerId from request:', id);
            }
          }
        }
      }
    });

    // Just load one page first and thoroughly debug it
    console.log('Loading Mesa store with full network debugging...');
    await page.goto('https://jarscannabis.com/shop/mesa-az/', {
      waitUntil: 'networkidle',
      timeout: 60000
    });

    console.log('\nWaiting 5 seconds for dynamic content...');
    await page.waitForTimeout(5000);

    // Get page title and content
    const title = await page.title();
    console.log('Page title:', title);

    const content = await page.content();
    console.log('Page content length:', content.length);

    // Save screenshot
    await page.screenshot({ path: '/tmp/jars-mesa-debug.png', fullPage: true });
    console.log('Screenshot saved to /tmp/jars-mesa-debug.png');

    // Look for all UUIDs in content
    const uuids = content.match(uuidPattern);
    if (uuids) {
      const uniqueUuids = [...new Set(uuids)];
      console.log('\n=== All UUIDs found on page ===');
      uniqueUuids.forEach(u => console.log(u));
    }

    // Look for all iframes
    const iframes = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('iframe')).map(f => ({
        src: f.src,
        id: f.id,
        name: f.name,
        className: f.className
      }));
    });
    console.log('\n=== Iframes ===');
    console.log(JSON.stringify(iframes, null, 2));

    // Look for any elements with dutchie (first 20 mentions only)
    const dutchieElements = await page.evaluate(() => {
      const elements = document.body.innerHTML.match(/dutchie[^<>]*\"/gi) || [];
      return elements.slice(0, 20);
    });
    console.log('\n=== Dutchie mentions ===');
    dutchieElements.forEach(e => console.log(e));

    // Look for script src containing dutchie
    const scripts = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('script[src]'))
        .map(s => s.getAttribute('src'))
        .filter(src => src && (src.includes('dutchie') || src.includes('embed')));
    });
    console.log('\n=== Relevant scripts ===');
    scripts.forEach(s => console.log(s));

    // Look for __NEXT_DATA__
    const nextData = await page.evaluate(() => {
      const el = document.getElementById('__NEXT_DATA__');
      return el ? el.textContent : null;
    });
    if (nextData) {
      console.log('\n=== __NEXT_DATA__ found ===');
      const data = JSON.parse(nextData);
      // Look for retailer in various places
      const propsStr = JSON.stringify(data, null, 2);
      // Find all UUID patterns in the props
      const propsUuids = propsStr.match(uuidPattern);
      if (propsUuids) {
        console.log('UUIDs in __NEXT_DATA__:', [...new Set(propsUuids)]);
      }
    } else {
      console.log('\nNo __NEXT_DATA__ found');
    }

    // Look for specific Dutchie embed patterns
    const embedPatterns = content.match(/https:\/\/[^"'\s]*dutchie[^"'\s]*/gi);
    if (embedPatterns) {
      console.log('\n=== Dutchie embed URLs ===');
      [...new Set(embedPatterns)].forEach(u => console.log(u));
    }

    console.log('\n=== Network requests summary ===');
    console.log('Total requests:', allRequests.length);
    const dutchieRequests = allRequests.filter(r => r.includes('dutchie'));
    console.log('Dutchie requests:', dutchieRequests.length);
    dutchieRequests.forEach(r => console.log(r));

    console.log('\n=== CAPTURED IDS ===');
    console.log(capturedIds);
  } finally {
    await browser.close();
  }
}
|
||||
|
||||
// Script entry point: log the failure message and make the process exit non-zero.
extractJarsAzStoreIds().catch((e) => {
  console.error('Error:', e.message);
  process.exitCode = 1;
});
|
||||
197
backend/src/scripts/jars-az-finder.ts
Normal file
197
backend/src/scripts/jars-az-finder.ts
Normal file
@@ -0,0 +1,197 @@
|
||||
import { chromium } from 'playwright';
|
||||
|
||||
async function findJarsAzStores() {
|
||||
const browser = await chromium.launch({ headless: true });
|
||||
const page = await browser.newPage();
|
||||
|
||||
const capturedRetailerIds: { url: string; retailerId: string }[] = [];
|
||||
const allApiCalls: string[] = [];
|
||||
|
||||
// Intercept ALL requests to find retailer IDs
|
||||
page.on('request', (request) => {
|
||||
const url = request.url();
|
||||
|
||||
// Log Buddy API calls
|
||||
if (url.includes('buddyapi') || url.includes('dutchie') || url.includes('graphql')) {
|
||||
allApiCalls.push(url);
|
||||
const postData = request.postData();
|
||||
if (postData) {
|
||||
// Look for retailerId in various formats
|
||||
const match = postData.match(/retailerId['":\s]+([a-f0-9-]{36})/i);
|
||||
if (match) {
|
||||
capturedRetailerIds.push({ url, retailerId: match[1] });
|
||||
}
|
||||
}
|
||||
// Also check URL params
|
||||
const urlMatch = url.match(/retailerId=([a-f0-9-]{36})/i);
|
||||
if (urlMatch) {
|
||||
capturedRetailerIds.push({ url, retailerId: urlMatch[1] });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
try {
|
||||
// First, let's try to find the actual Arizona menu URLs
|
||||
console.log('Loading JARS find-a-dispensary page...');
|
||||
await page.goto('https://jarscannabis.com/find-a-dispensary', {
|
||||
waitUntil: 'networkidle',
|
||||
timeout: 30000
|
||||
});
|
||||
await page.waitForTimeout(3000);
|
||||
|
||||
// Take screenshot
|
||||
await page.screenshot({ path: '/tmp/jars-find-dispensary.png', fullPage: true });
|
||||
console.log('Screenshot saved to /tmp/jars-find-dispensary.png');
|
||||
|
||||
// Try to find state selector and click Arizona
|
||||
console.log('\nLooking for state selector...');
|
||||
|
||||
// Try various ways to select Arizona
|
||||
const stateSelectors = [
|
||||
'select[name*="state"]',
|
||||
'[class*="state"] select',
|
||||
'select option[value="AZ"]',
|
||||
'button:has-text("Arizona")',
|
||||
'a:has-text("Arizona")',
|
||||
'[data-state="AZ"]',
|
||||
'div:has-text("Arizona")',
|
||||
];
|
||||
|
||||
for (const selector of stateSelectors) {
|
||||
try {
|
||||
const element = page.locator(selector).first();
|
||||
const isVisible = await element.isVisible({ timeout: 1000 });
|
||||
if (isVisible) {
|
||||
console.log(`Found element with selector: ${selector}`);
|
||||
await element.click();
|
||||
await page.waitForTimeout(2000);
|
||||
}
|
||||
} catch (e) {
|
||||
// Continue to next selector
|
||||
}
|
||||
}
|
||||
|
||||
// Get all links on the page
|
||||
const links = await page.evaluate(() => {
|
||||
return Array.from(document.querySelectorAll('a')).map(a => ({
|
||||
href: a.href,
|
||||
text: a.textContent?.trim()
|
||||
})).filter(l => l.href.includes('/shop') || l.href.includes('menu') || l.href.includes('arizona') || l.href.includes('-az'));
|
||||
});
|
||||
|
||||
console.log('\n=== Shop/Menu Links Found ===');
|
||||
links.forEach(l => console.log(`${l.text}: ${l.href}`));
|
||||
|
||||
// Look for __NEXT_DATA__ which might have location data
|
||||
const nextData = await page.evaluate(() => {
|
||||
const el = document.getElementById('__NEXT_DATA__');
|
||||
return el?.textContent || null;
|
||||
});
|
||||
|
||||
if (nextData) {
|
||||
console.log('\n=== Analyzing __NEXT_DATA__ ===');
|
||||
const data = JSON.parse(nextData);
|
||||
const dataStr = JSON.stringify(data);
|
||||
|
||||
// Look for Arizona references
|
||||
if (dataStr.includes('Arizona') || dataStr.includes('AZ')) {
|
||||
console.log('Found Arizona references in __NEXT_DATA__');
|
||||
|
||||
// Extract all objects that might be Arizona stores
|
||||
const findArizonaStores = (obj: any, path: string = ''): any[] => {
|
||||
const results: any[] = [];
|
||||
if (!obj || typeof obj !== 'object') return results;
|
||||
|
||||
if (Array.isArray(obj)) {
|
||||
obj.forEach((item, i) => {
|
||||
results.push(...findArizonaStores(item, `${path}[${i}]`));
|
||||
});
|
||||
} else {
|
||||
// Check if this object looks like an AZ store
|
||||
if (obj.state === 'AZ' || obj.state === 'Arizona' ||
|
||||
obj.stateCode === 'AZ' || obj.region === 'Arizona' ||
|
||||
(obj.city && ['Mesa', 'Phoenix', 'Peoria', 'Payson', 'Globe', 'Safford', 'Somerton', 'Prescott Valley'].includes(obj.city))) {
|
||||
results.push({ path, data: obj });
|
||||
}
|
||||
|
||||
for (const key of Object.keys(obj)) {
|
||||
results.push(...findArizonaStores(obj[key], `${path}.${key}`));
|
||||
}
|
||||
}
|
||||
return results;
|
||||
};
|
||||
|
||||
const azStores = findArizonaStores(data);
|
||||
console.log(`Found ${azStores.length} Arizona store objects`);
|
||||
azStores.forEach(s => {
|
||||
console.log('\n---');
|
||||
console.log('Path:', s.path);
|
||||
console.log(JSON.stringify(s.data, null, 2));
|
||||
});
|
||||
}
|
||||
|
||||
// Also look for retailer IDs
|
||||
const retailerMatches = dataStr.match(/"retailerId"\s*:\s*"([a-f0-9-]{36})"/gi);
|
||||
if (retailerMatches) {
|
||||
console.log('\n=== RetailerIds in __NEXT_DATA__ ===');
|
||||
const uniqueIds = [...new Set(retailerMatches.map(m => {
|
||||
const match = m.match(/([a-f0-9-]{36})/i);
|
||||
return match ? match[1] : null;
|
||||
}).filter(Boolean))];
|
||||
uniqueIds.forEach(id => console.log(id));
|
||||
}
|
||||
}
|
||||
|
||||
// Try loading a known store URL pattern
|
||||
const testUrls = [
|
||||
'https://jarscannabis.com/arizona/',
|
||||
'https://jarscannabis.com/az/',
|
||||
'https://jarscannabis.com/stores/arizona/',
|
||||
'https://jarscannabis.com/locations/arizona/',
|
||||
'https://jarscannabis.com/shop/arizona/',
|
||||
'https://az.jarscannabis.com/',
|
||||
];
|
||||
|
||||
console.log('\n=== Testing Arizona URLs ===');
|
||||
for (const testUrl of testUrls) {
|
||||
try {
|
||||
const response = await page.goto(testUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
|
||||
const status = response?.status();
|
||||
console.log(`${testUrl}: ${status}`);
|
||||
if (status === 200) {
|
||||
const title = await page.title();
|
||||
console.log(` Title: ${title}`);
|
||||
|
||||
// If we found a working page, extract store links
|
||||
const storeLinks = await page.evaluate(() => {
|
||||
return Array.from(document.querySelectorAll('a')).map(a => ({
|
||||
href: a.href,
|
||||
text: a.textContent?.trim()
|
||||
})).filter(l => l.href.includes('shop') || l.href.includes('menu'));
|
||||
});
|
||||
|
||||
if (storeLinks.length > 0) {
|
||||
console.log(' Store links:');
|
||||
storeLinks.forEach(l => console.log(` ${l.text}: ${l.href}`));
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(`${testUrl}: Error - ${(e as Error).message.substring(0, 50)}`);
|
||||
}
|
||||
}
|
||||
|
||||
console.log('\n=== Captured Retailer IDs from API calls ===');
|
||||
const uniqueRetailerIds = [...new Map(capturedRetailerIds.map(r => [r.retailerId, r])).values()];
|
||||
uniqueRetailerIds.forEach(r => {
|
||||
console.log(`${r.retailerId} (from: ${r.url.substring(0, 60)}...)`);
|
||||
});
|
||||
|
||||
console.log('\n=== All API calls ===');
|
||||
allApiCalls.forEach(url => console.log(url.substring(0, 100)));
|
||||
|
||||
} finally {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
|
||||
// Script entry point: run the discovery and report any failure.
// The exit code is set to 1 on failure so shells and CI can detect it
// (previously the error was logged but the process still exited with 0).
findJarsAzStores().catch((e: unknown) => {
  // Rejections are not guaranteed to be Error instances.
  console.error('Error:', e instanceof Error ? e.message : e);
  process.exitCode = 1;
});
|
||||
350
backend/src/scripts/platform-id-extractor.ts
Normal file
350
backend/src/scripts/platform-id-extractor.ts
Normal file
@@ -0,0 +1,350 @@
|
||||
/**
|
||||
* Platform ID Extractor - Standalone script for extracting Dutchie platform IDs
|
||||
*
|
||||
* This script visits dispensary websites to capture their Dutchie retailerId
|
||||
* by intercepting network requests to the Dutchie GraphQL API.
|
||||
*
|
||||
* It does NOT use the main orchestrator - it's a standalone browser-based tool.
|
||||
*/
|
||||
|
||||
import { chromium, Browser, BrowserContext, Page } from 'playwright';
|
||||
import { Pool } from 'pg';
|
||||
|
||||
// Shared Postgres connection pool; the connection string is read from DATABASE_URL.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Row selected from the `dispensaries` table (`id`, `name`, `website`)
 * that the extractor visits.
 */
interface Dispensary {
  /** Primary key of the dispensary row. */
  id: number;
  /** Name used in log output. */
  name: string;
  /** URL that the browser navigates to first. */
  website: string;
}
|
||||
|
||||
/** Outcome of one extraction attempt for a single dispensary. */
interface ExtractionResult {
  /** Copied from the dispensary that was processed. */
  id: number;
  name: string;
  website: string;

  /** The captured ID, or null when none was found. */
  platformId: string | null;
  /** Label of the place the ID was found (e.g. 'request_url', 'iframe_src'), or null. */
  source: string | null;
  /** Truncated error message when the attempt threw, otherwise null. */
  error: string | null;
}
|
||||
|
||||
/**
 * Patterns whose first capture group is a 24-hex-character ID.
 * They are tried in order, so the more specific patterns come first and the
 * generic "ID in a URL path" pattern is the last resort.
 */
const PLATFORM_ID_PATTERNS: readonly RegExp[] = [
  /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i,
  /["']dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i,
  /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i,
  /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i,
  /dutchie\.com\/dispensary\/([a-f0-9]{24})/i,
  /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i,
  /retailerId=([a-f0-9]{24})/i,
  /\/([a-f0-9]{24})(?:\/|\?|$)/i, // Generic ID in URL path
];

/** Returns the first 24-character ID captured by any pattern in `text`, or null. */
function findPlatformId(text: string, patterns: readonly RegExp[]): string | null {
  for (const pattern of patterns) {
    const candidate = text.match(pattern)?.[1];
    if (candidate && candidate.length === 24) {
      return candidate;
    }
  }
  return null;
}

/**
 * Visits a dispensary website and tries to discover its Dutchie platform ID.
 *
 * Sources are checked in this order, and the first ID found wins:
 * intercepted request URLs/bodies to Dutchie, page HTML, `__NEXT_DATA__`,
 * Dutchie iframe URLs, script tags, then the first menu/shop/order link
 * (page HTML and iframes again), and finally a delayed iframe re-check.
 *
 * @param browser    Browser used to open a fresh context for this dispensary.
 * @param dispensary Dispensary row providing the website to visit.
 * @returns The dispensary fields plus the captured ID and its source label, or
 *          a truncated error message if the visit threw. Page-level failures
 *          are reported in the result instead of rejecting.
 */
async function extractPlatformId(browser: Browser, dispensary: Dispensary): Promise<ExtractionResult> {
  const capture: { id: string | null; source: string | null } = { id: null, source: null };
  let errorMsg: string | null = null;

  const isCaptured = (): boolean => capture.id !== null;

  // Stores `id` only if nothing was captured before; returns whether an ID is now known.
  const record = (id: string | null, source: string): boolean => {
    if (capture.id === null && id !== null) {
      capture.id = id;
      capture.source = source;
    }
    return capture.id !== null;
  };

  const context = await browser.newContext({
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
  });

  try {
    // Created inside the try so the context is always closed, even if this rejects.
    const page = await context.newPage();

    // Scans the src of every iframe that mentions "dutchie" for a 24-hex ID.
    const findIdInDutchieIframes = async (): Promise<string | null> => {
      const sources = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
      });
      for (const src of sources) {
        if (!src.includes('dutchie')) continue;
        const match = src.match(/([a-f0-9]{24})/i);
        if (match) return match[1];
      }
      return null;
    };

    // Intercept network requests: IDs often only appear in calls to Dutchie.
    page.on('request', (request) => {
      if (isCaptured()) return;

      const url = request.url();
      if (!url.includes('dutchie')) return;

      if (record(findPlatformId(url, PLATFORM_ID_PATTERNS), 'request_url')) return;

      const postData = request.postData();
      if (postData) {
        record(findPlatformId(postData, PLATFORM_ID_PATTERNS), 'request_body');
      }
    });

    console.log(`\n[${dispensary.id}] ${dispensary.name}: ${dispensary.website}`);

    // Load main page
    await page.goto(dispensary.website, {
      waitUntil: 'domcontentloaded',
      timeout: 25000
    });
    await page.waitForTimeout(2000);

    // Check page content
    if (!isCaptured()) {
      record(findPlatformId(await page.content(), PLATFORM_ID_PATTERNS), 'page_content');
    }

    // Check __NEXT_DATA__
    if (!isCaptured()) {
      const nextData = await page.evaluate(() => {
        const el = document.getElementById('__NEXT_DATA__');
        return el?.textContent || null;
      });
      if (nextData) {
        record(findPlatformId(nextData, PLATFORM_ID_PATTERNS), '__NEXT_DATA__');
      }
    }

    // Check iframes
    if (!isCaptured()) {
      record(await findIdInDutchieIframes(), 'iframe_src');
    }

    // Check scripts (external URL, or the first 1000 characters of inline code)
    if (!isCaptured()) {
      const scripts = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('script'))
          .map(s => s.src || s.innerHTML?.substring(0, 1000))
          .filter(Boolean);
      });
      for (const script of scripts) {
        if (script && (script.includes('dutchie') || script.includes('retailerId'))) {
          if (record(findPlatformId(script, PLATFORM_ID_PATTERNS), 'script')) break;
        }
      }
    }

    // Try navigating to menu/shop page
    if (!isCaptured()) {
      const menuLink = await page.evaluate(() => {
        const links = Array.from(document.querySelectorAll('a'));
        for (const link of links) {
          const href = link.href?.toLowerCase() || '';
          const text = link.textContent?.toLowerCase() || '';
          if (href.includes('menu') || href.includes('shop') || href.includes('order') ||
              text.includes('menu') || text.includes('shop') || text.includes('order')) {
            return link.href;
          }
        }
        return null;
      });

      if (menuLink && !menuLink.startsWith('javascript:')) {
        try {
          console.log(` -> Following menu link: ${menuLink.substring(0, 60)}...`);
          await page.goto(menuLink, { waitUntil: 'domcontentloaded', timeout: 20000 });
          await page.waitForTimeout(3000);

          // The request listener may already have captured an ID during this
          // navigation; only fall back to scanning the page when it did not,
          // so an intercepted ID is never overwritten by a page-content match.
          if (!isCaptured()) {
            record(findPlatformId(await page.content(), PLATFORM_ID_PATTERNS), 'menu_page_content');
          }
          if (!isCaptured()) {
            record(await findIdInDutchieIframes(), 'menu_page_iframe');
          }
        } catch (navError: unknown) {
          // Best effort: a failed menu navigation must not abort the extraction.
          const reason = navError instanceof Error ? navError.message : String(navError);
          console.log(` -> Menu navigation failed: ${reason.substring(0, 100)}`);
        }
      }
    }

    // Final wait for async content, then one last iframe check
    if (!isCaptured()) {
      await page.waitForTimeout(3000);
      record(await findIdInDutchieIframes(), 'delayed_iframe');
    }

    if (isCaptured()) {
      console.log(` ✓ Found: ${capture.id} (${capture.source})`);
    } else {
      console.log(` ✗ Not found`);
    }
  } catch (e: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    errorMsg = (e instanceof Error ? e.message : String(e)).substring(0, 100);
    console.log(` ✗ Error: ${errorMsg}`);
  } finally {
    await context.close();
  }

  return {
    id: dispensary.id,
    name: dispensary.name,
    website: dispensary.website,
    platformId: capture.id,
    source: capture.source,
    error: errorMsg
  };
}
|
||||
|
||||
/**
 * CLI entry point.
 *
 * With a dispensary ID as the first argument, only that dispensary is
 * processed. With no argument, every AZ dispensary with menu_type 'dutchie',
 * a website and no platform_dispensary_id is processed. Each ID found is
 * written back to `dispensaries.platform_dispensary_id` immediately, then a
 * summary is printed.
 *
 * @throws Error when the argument is present but is not a positive integer.
 *         Previously such input silently fell through to the "process all"
 *         branch, which updates the database.
 */
async function main(): Promise<void> {
  try {
    // Get specific dispensary ID from command line, or process all missing
    const rawTargetId = process.argv[2];
    let targetId: number | null = null;
    if (rawTargetId) {
      if (!/^\d+$/.test(rawTargetId) || Number(rawTargetId) <= 0) {
        throw new Error(`Invalid dispensary ID "${rawTargetId}": expected a positive integer`);
      }
      targetId = Number(rawTargetId);
    }

    let query: string;
    let params: number[] = [];

    if (targetId !== null) {
      query = `
        SELECT id, name, website
        FROM dispensaries
        WHERE id = $1
          AND website IS NOT NULL AND website != ''
      `;
      params = [targetId];
    } else {
      query = `
        SELECT id, name, website
        FROM dispensaries
        WHERE state = 'AZ'
          AND menu_type = 'dutchie'
          AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '')
          AND website IS NOT NULL AND website != ''
        ORDER BY name
      `;
    }

    const result = await pool.query(query, params);

    if (result.rows.length === 0) {
      console.log('No dispensaries to process');
      return;
    }

    console.log(`\n=== Platform ID Extractor ===`);
    console.log(`Processing ${result.rows.length} dispensaries...\n`);

    const results: ExtractionResult[] = [];

    const browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });

    try {
      for (const dispensary of result.rows) {
        const extractionResult = await extractPlatformId(browser, dispensary);
        results.push(extractionResult);

        // Update database immediately if found, so progress survives a later failure
        if (extractionResult.platformId) {
          await pool.query(
            'UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2',
            [extractionResult.platformId, extractionResult.id]
          );
        }
      }
    } finally {
      // Always shut the browser down, even when an extraction or update throws.
      await browser.close();
    }

    // Summary
    console.log('\n' + '='.repeat(60));
    console.log('SUMMARY');
    console.log('='.repeat(60));

    const found = results.filter(r => r.platformId);
    const notFound = results.filter(r => !r.platformId);

    console.log(`\nFound: ${found.length}/${results.length}`);
    if (found.length > 0) {
      console.log('\nSuccessful extractions:');
      found.forEach(r => console.log(` [${r.id}] ${r.name} -> ${r.platformId} (${r.source})`));
    }

    if (notFound.length > 0) {
      console.log(`\nNot found: ${notFound.length}`);
      notFound.forEach(r => {
        const reason = r.error || 'No Dutchie ID detected';
        console.log(` [${r.id}] ${r.name}: ${reason}`);
      });
    }
  } finally {
    // Release database connections on every exit path (success, early return, error).
    await pool.end();
  }
}
|
||||
|
||||
// Run the extractor; log any uncaught failure and exit with a non-zero status.
main().catch((fatal) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});
|
||||
301
backend/src/scripts/test-jane-scraper.ts
Normal file
301
backend/src/scripts/test-jane-scraper.ts
Normal file
@@ -0,0 +1,301 @@
|
||||
/**
|
||||
* Test script for iHeartJane menu scraping via Playwright
|
||||
* Intercepts API/Algolia calls made by the browser
|
||||
*/
|
||||
|
||||
import { chromium } from 'playwright';
|
||||
|
||||
/**
 * Shape of a product record for the iHeartJane scraper.
 * NOTE(review): field meanings are inferred from their names only; confirm
 * against real API/Algolia payloads before relying on them.
 */
interface JaneProduct {
  // Identity (required)
  id: number;
  name: string;

  // Classification
  brand?: string;
  category?: string;
  kind?: string;
  kind_subtype?: string;

  // Pricing
  price?: number;
  prices?: Record<string, number>;

  // Potency
  thc_potency?: number;
  cbd_potency?: number;

  // Presentation
  image_url?: string;
  description?: string;

  // Owning store
  store_id?: number;
}
|
||||
|
||||
/**
 * Loads an iHeartJane menu in headless Chromium and collects whatever product
 * data can be observed: Algolia/Jane API responses seen on the network,
 * product cards in the DOM, embedded page state, and Algolia request
 * credentials. A full-page screenshot is written to /tmp for debugging.
 *
 * @param urlOrStoreId A full menu URL, or a store ID that is expanded to the
 *                     embed menu URL.
 * @returns The captured API responses, DOM products, embedded data (or null)
 *          and captured credentials.
 */
async function scrapeJaneMenu(urlOrStoreId: string) {
  // Handle either a full URL or just a store ID
  const menuUrl = urlOrStoreId.startsWith('http')
    ? urlOrStoreId
    : `https://www.iheartjane.com/embed/stores/${urlOrStoreId}/menu`;

  console.log(`Starting Playwright scrape for iHeartJane: ${menuUrl}`);

  const browser = await chromium.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled'
    ]
  });

  const apiResponses: any[] = [];
  const capturedCredentials: any = {};

  // Everything after launch runs inside try/finally so the browser is closed
  // even when context or page setup fails.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 },
      locale: 'en-US',
      timezoneId: 'America/Chicago'
    });

    // Add stealth scripts to avoid detection
    await context.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    const page = await context.newPage();

    // Capture Algolia credentials from request headers
    page.on('request', (request) => {
      const url = request.url();
      if (!url.includes('algolia')) return;

      const headers = request.headers();
      const appId = headers['x-algolia-application-id'];
      const apiKey = headers['x-algolia-api-key'];
      if (appId && apiKey) {
        capturedCredentials.algolia = { appId, apiKey };
        console.log(`Captured Algolia credentials: App=${appId}, Key=${apiKey.substring(0, 10)}...`);
      }
    });

    page.on('response', async (response) => {
      const url = response.url();

      // Capture Algolia search results
      if (url.includes('algolia.net') || url.includes('algolianet.com')) {
        try {
          const data = await response.json();
          if (data.results && data.results[0] && data.results[0].hits) {
            console.log(`Captured ${data.results[0].hits.length} products from Algolia`);
            apiResponses.push({ type: 'algolia', data: data.results[0] });
          }
        } catch (e) {
          // Not JSON or error parsing; this capture is best effort
        }
      }

      // Capture Jane API responses
      if (url.includes('api.iheartjane.com') && url.includes('products')) {
        try {
          const data = await response.json();
          console.log(`Captured Jane API response: ${url}`);
          apiResponses.push({ type: 'jane-api', url, data });
        } catch (e) {
          // Not JSON or error parsing; this capture is best effort
        }
      }
    });

    console.log(`Navigating to: ${menuUrl}`);

    await page.goto(menuUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    // Wait for page to settle
    await page.waitForTimeout(2000);

    // Handle age gate: try progressively more forceful ways to click "Yes"
    console.log('Looking for age gate...');
    try {
      let clicked = false;
      const yesButton = page.locator('button:has-text("Yes")').first();

      // Method 1: Playwright locator with a forced click
      try {
        await yesButton.waitFor({ state: 'visible', timeout: 5000 });
        await yesButton.click({ force: true });
        clicked = true;
        console.log('Clicked age gate via Playwright locator');
        await page.waitForTimeout(5000);
      } catch (e) {
        console.log('Playwright locator failed:', e instanceof Error ? e.message : e);
      }

      // Method 2: Click the center of the button's bounding box
      if (!clicked) {
        try {
          const box = await yesButton.boundingBox();
          if (box) {
            const centerX = box.x + box.width / 2;
            const centerY = box.y + box.height / 2;
            await page.mouse.click(centerX, centerY);
            clicked = true;
            console.log(`Clicked age gate at coordinates: ${centerX}, ${centerY}`);
            await page.waitForTimeout(5000);
          }
        } catch (e) {
          console.log('Bounding box click failed');
        }
      }

      // Method 3: JavaScript click from inside the page
      if (!clicked) {
        const jsClickResult = await page.evaluate(() => {
          const buttons = Array.from(document.querySelectorAll('button'));
          for (const btn of buttons) {
            if (btn.textContent?.includes('Yes')) {
              btn.click();
              return { success: true, buttonText: btn.textContent };
            }
          }
          return { success: false };
        });
        if (jsClickResult.success) {
          clicked = true;
          console.log(`Clicked via JS: ${jsClickResult.buttonText}`);
          await page.waitForTimeout(5000);
        }
      }

      // Method 4: Synthetic click event via dispatchEvent
      if (!clicked) {
        const dispatchResult = await page.evaluate(() => {
          const buttons = Array.from(document.querySelectorAll('button'));
          for (const btn of buttons) {
            if (btn.textContent?.includes('Yes')) {
              btn.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
              return true;
            }
          }
          return false;
        });
        if (dispatchResult) {
          clicked = true;
          console.log('Clicked via dispatchEvent');
          await page.waitForTimeout(5000);
        }
      }

      // Log button info for debugging
      const buttonInfo = await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        return buttons.map(b => ({
          text: b.textContent?.trim(),
          visible: b.offsetParent !== null,
          rect: b.getBoundingClientRect()
        }));
      });
      console.log('Buttons found:', JSON.stringify(buttonInfo, null, 2));
    } catch (e) {
      // The age gate may be absent; continue with whatever is on the page.
      console.log('Age gate handling error:', e);
    }

    // Wait for content to load after age gate
    await page.waitForTimeout(3000);

    // Try to scroll to trigger more product loads
    console.log('Scrolling to load more products...');
    for (let i = 0; i < 3; i++) {
      await page.evaluate(() => window.scrollBy(0, 1000));
      await page.waitForTimeout(1000);
    }

    // Extract products from the page DOM as backup
    const domProducts = await page.evaluate(() => {
      const items: any[] = [];
      // Try various selectors that Jane might use
      const productCards = document.querySelectorAll('[data-testid*="product"], [class*="ProductCard"], [class*="product-card"], .product-tile');

      productCards.forEach((card) => {
        const name = card.querySelector('[class*="name"], [class*="title"], h3, h4')?.textContent?.trim();
        const brand = card.querySelector('[class*="brand"]')?.textContent?.trim();
        const price = card.querySelector('[class*="price"]')?.textContent?.trim();
        const image = card.querySelector('img')?.getAttribute('src');

        if (name) {
          items.push({ name, brand, price, image, source: 'dom' });
        }
      });

      return items;
    });

    console.log(`Extracted ${domProducts.length} products from DOM`);

    // Check for __NEXT_DATA__ or similar embedded data
    const embeddedData = await page.evaluate(() => {
      // Check for Next.js data
      const nextData = document.getElementById('__NEXT_DATA__');
      if (nextData) {
        try {
          return { type: 'next', data: JSON.parse(nextData.textContent || '{}') };
        } catch (parseError) {
          // Malformed JSON must not abort the scrape; fall through to the other sources.
        }
      }

      // Check for any window-level product data
      const win = window as any;
      if (win.__INITIAL_STATE__) return { type: 'initial_state', data: win.__INITIAL_STATE__ };
      if (win.__PRELOADED_STATE__) return { type: 'preloaded', data: win.__PRELOADED_STATE__ };
      if (win.products) return { type: 'products', data: win.products };

      return null;
    });

    if (embeddedData) {
      console.log(`Found embedded data: ${embeddedData.type}`);
      apiResponses.push(embeddedData);
    }

    // Take a screenshot for debugging
    const screenshotPath = `/tmp/jane-scrape-${Date.now()}.png`;
    await page.screenshot({ path: screenshotPath, fullPage: true });
    console.log(`Screenshot saved to ${screenshotPath}`);

    // Process captured API responses
    console.log('\n=== API Responses Summary ===');
    for (const resp of apiResponses) {
      console.log(`Type: ${resp.type}`);
      if (resp.type === 'algolia' && resp.data.hits) {
        console.log(` Hits: ${resp.data.hits.length}`);
        console.log(` Total: ${resp.data.nbHits}`);
        if (resp.data.hits[0]) {
          console.log(` Sample product:`, JSON.stringify(resp.data.hits[0], null, 2).substring(0, 1000));
        }
      }
    }

    console.log('\n=== DOM Products Sample ===');
    console.log(JSON.stringify(domProducts.slice(0, 3), null, 2));

    console.log('\n=== Captured Credentials ===');
    console.log(JSON.stringify(capturedCredentials, null, 2));

    return {
      apiResponses,
      domProducts,
      embeddedData,
      capturedCredentials
    };
  } finally {
    await browser.close();
  }
}
|
||||
|
||||
// Script entry point: scrape the menu given as the first CLI argument,
// falling back to the built-in default when none is supplied.
const urlOrStoreId = process.argv[2] || 'https://iheartjane.com/aly2djS2yXoTGnR0/DBeqE6HSSwijog9l'; // Default to The Flower Shop Az

void (async () => {
  try {
    const outcome = await scrapeJaneMenu(urlOrStoreId);
    console.log('\n=== Scrape Complete ===');
    console.log(`Total API responses captured: ${outcome.apiResponses.length}`);
    console.log(`Total DOM products: ${outcome.domProducts.length}`);
  } catch (failure) {
    // Any failure (scrape or reporting) ends the process with a non-zero status.
    console.error('Scrape failed:', failure);
    process.exit(1);
  }
})();
|
||||
Reference in New Issue
Block a user