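// Populates the brand_scrape_jobs queue for a single dispensary by scraping
// the brand links from its menu's brands page.
// Usage: pass the dispensary ID as the first CLI argument (defaults to 112).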
import { firefox } from 'playwright';
import { pool } from './src/db/migrate.js';
import { getRandomProxy } from './src/utils/proxyManager.js';
/**
 * Parses the dispensary ID given on the command line.
 *
 * @param raw - The raw CLI argument; `undefined` or an empty string selects the default ID 112.
 * @returns The parsed integer ID.
 * @throws Error when the argument cannot be parsed as an integer, so that `NaN`
 *   never reaches the database queries further down.
 */
function parseDispensaryId(raw: string | undefined): number {
  const id = parseInt(raw || '112', 10);
  if (Number.isNaN(id)) {
    throw new Error(`Invalid dispensary ID: "${raw}" (expected an integer)`);
  }
  return id;
}

/** ID of the dispensary whose brands are queued; taken from the first CLI argument. */
const dispensaryId = parseDispensaryId(process.argv[2]);
/** A brand discovered on a dispensary menu's brands listing. */
interface Brand {
  /** Path segment that follows `/brands/` in the brand link; used as the job key. */
  slug: string;
  /** Display name read from the brand link's text. */
  name: string;
  /** Link to the brand's page, taken from the anchor on the brands listing. */
  url: string;
}
/** Raw anchor data pulled out of the browser context before any normalisation. */
interface RawBrandLink {
  /** The `href` attribute exactly as written in the markup (may be relative). */
  href: string;
  /** The anchor's `href` DOM property, i.e. the URL as resolved by the browser. */
  resolvedUrl: string;
  /** Text of the `ContentWrapper` descendant, if the link has one. */
  wrapperText: string | null;
  /** Full text content of the anchor. */
  linkText: string | null;
}

/**
 * Extracts the brand slug from a brand link href.
 *
 * Only the first path segment after `/brands/` is kept, and any query string
 * or fragment is dropped, so `/brands/acme/`, `/brands/acme?ref=nav` and
 * `/brands/acme/products` all yield `acme`.
 *
 * @param href - The link's href attribute.
 * @returns The slug, or an empty string when the href has no segment after `/brands/`.
 */
function extractBrandSlug(href: string): string {
  const afterBrands = href.split('/brands/')[1] ?? '';
  const pathOnly = afterBrands.split(/[?#]/)[0] ?? '';
  return pathOnly.split('/')[0] ?? '';
}

/**
 * Converts raw link data into brands, dropping links without a slug and
 * keeping only the first link seen for each slug.
 */
function toUniqueBrands(rawLinks: readonly RawBrandLink[]): Brand[] {
  const seenSlugs = new Set<string>();
  const brands: Brand[] = [];

  for (const raw of rawLinks) {
    const slug = extractBrandSlug(raw.href);
    // Prefer the ContentWrapper text to avoid placeholder letter duplication.
    const name = raw.wrapperText?.trim() || raw.linkText?.trim() || slug;

    if (!slug || !name || seenSlugs.has(slug)) continue;
    seenSlugs.add(slug);

    // Use the browser-resolved URL so relative hrefs become absolute.
    brands.push({ slug, name, url: raw.resolvedUrl || raw.href });
  }

  return brands;
}

/**
 * Loads `<menuUrl>/brands`, scrolls until the page stops growing (at most ten
 * rounds) and collects every distinct brand link on it.
 *
 * @param menuUrl - Base URL of the dispensary menu; trailing slashes are ignored.
 * @param context - Browser context the page belongs to. Currently unused; kept
 *   so existing callers do not have to change.
 * @param page - Playwright page used for navigation and DOM extraction.
 * @returns The distinct brands found, or an empty array if anything fails
 *   (the error is logged, not rethrown).
 */
async function scrapeBrandsList(menuUrl: string, context: any, page: any): Promise<Brand[]> {
  try {
    // Strip trailing slashes so a menu URL ending in "/" does not produce "//brands".
    const brandsUrl = `${menuUrl.replace(/\/+$/, '')}/brands`;
    console.log(`📄 Loading brands page: ${brandsUrl}`);

    await page.goto(brandsUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    console.log(`⏳ Waiting for brands to render...`);
    await page.waitForSelector('a[href*="/brands/"]', { timeout: 45000 });
    console.log(`✅ Brands appeared!`);
    await page.waitForTimeout(3000);

    // Scroll to load all brands; stop as soon as the document height stops changing.
    console.log(`📜 Scrolling to load all brands...`);
    const maxScrolls = 10;
    let previousHeight = 0;

    for (let scrollAttempts = 0; scrollAttempts < maxScrolls; scrollAttempts++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500);

      const currentHeight: number = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) break;
      previousHeight = currentHeight;
    }

    // Only plain data is gathered inside the browser; normalisation happens in
    // Node because the callback is serialised and cannot call local helpers.
    const rawLinks: RawBrandLink[] = await page.evaluate(() =>
      Array.from(document.querySelectorAll<HTMLAnchorElement>('a[href*="/brands/"]')).map(link => ({
        href: link.getAttribute('href') || '',
        resolvedUrl: link.href || '',
        wrapperText: link.querySelector('[class*="ContentWrapper"]')?.textContent ?? null,
        linkText: link.textContent ?? null
      }))
    );

    const brands = toUniqueBrands(rawLinks);
    console.log(`✅ Found ${brands.length} total brands`);
    return brands;
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ Error scraping brands list:`, message);
    return [];
  }
}
/**
 * Script entry point: looks up the dispensary, scrapes its brands page through
 * a proxied Firefox session and inserts one pending job per brand into
 * `brand_scrape_jobs`.
 *
 * Expected failures (unknown dispensary, missing menu URL, no proxy, no brands)
 * are logged and reported through a non-zero `process.exitCode`. The browser
 * and the database pool are always released, including when an error is thrown.
 */
async function main(): Promise<void> {
  const divider = '='.repeat(60);

  console.log(`\n${divider}`);
  console.log(`🏭 POPULATING JOB QUEUE`);
  console.log(` Dispensary ID: ${dispensaryId}`);
  console.log(`${divider}\n`);

  try {
    // Get dispensary info
    const dispensaryResult = await pool.query(
      'SELECT id, name, menu_url FROM dispensaries WHERE id = $1',
      [dispensaryId]
    );

    const dispensary = dispensaryResult.rows[0];
    if (!dispensary) {
      console.error(`❌ Dispensary ID ${dispensaryId} not found`);
      process.exitCode = 1;
      return;
    }

    const menuUrl = dispensary.menu_url;
    if (!menuUrl) {
      // Without this guard a NULL menu_url would be scraped as "null/brands".
      console.error(`❌ Dispensary ID ${dispensaryId} has no menu URL`);
      process.exitCode = 1;
      return;
    }

    console.log(`✅ Dispensary: ${dispensary.name}`);
    console.log(` Menu URL: ${menuUrl}\n`);

    // Get proxy
    const proxy = await getRandomProxy();
    if (!proxy) {
      console.log(`❌ No proxy available`);
      process.exitCode = 1;
      return;
    }

    console.log(`🔐 Using proxy: ${proxy.server}\n`);

    // Launch browser
    const browser = await firefox.launch({ headless: true });

    try {
      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        proxy: {
          server: proxy.server,
          username: proxy.username,
          password: proxy.password
        }
      });

      const page = await context.newPage();

      // Get all brands
      const allBrands = await scrapeBrandsList(menuUrl, context, page);

      if (allBrands.length === 0) {
        console.log(`❌ No brands found`);
        process.exitCode = 1;
        return;
      }

      console.log(`\n📋 Found ${allBrands.length} brands. Populating job queue...\n`);

      // Insert jobs into database
      let inserted = 0;
      let skipped = 0;
      let failed = 0;

      for (const brand of allBrands) {
        try {
          // RETURNING yields a row only when the INSERT actually happened, so a
          // conflict (job already queued) comes back empty and counts as skipped.
          const result = await pool.query(`
            INSERT INTO brand_scrape_jobs (dispensary_id, brand_slug, brand_name, status)
            VALUES ($1, $2, $3, 'pending')
            ON CONFLICT (dispensary_id, brand_slug) DO NOTHING
            RETURNING id
          `, [dispensaryId, brand.slug, brand.name]);

          if (result.rows.length > 0) {
            inserted++;
            if (inserted % 10 === 0) {
              console.log(` Inserted ${inserted}/${allBrands.length} jobs...`);
            }
          } else {
            skipped++;
          }
        } catch (error: unknown) {
          // Best effort: one failing brand must not abort the rest of the queue.
          failed++;
          const message = error instanceof Error ? error.message : String(error);
          console.error(`❌ Error inserting job for ${brand.name}:`, message);
        }
      }

      console.log(`\n${divider}`);
      console.log(`✅ JOB QUEUE POPULATED`);
      console.log(` Total brands: ${allBrands.length}`);
      console.log(` Jobs inserted: ${inserted}`);
      console.log(` Jobs skipped (already exist): ${skipped}`);
      if (failed > 0) {
        console.log(` Jobs failed: ${failed}`);
      }
      console.log(`${divider}\n`);
    } finally {
      await browser.close();
    }
  } finally {
    await pool.end();
  }
}
// Run the script. A fatal error is logged and the process exits with a
// non-zero status so shells and schedulers can detect the failure.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});