Remove incorrect migration 029, add snapshot architecture, improve scraper
- Delete migration 029 that was incorrectly creating duplicate dispensaries
- Add migration 028 for snapshot architecture
- Improve downloader with proxy/UA rotation
- Update scraper monitor and tools pages
- Various scraper improvements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -93,7 +93,7 @@ export function ScraperMonitor() {
|
||||
marginBottom: '-2px'
|
||||
}}
|
||||
>
|
||||
Brand Scrape Jobs
|
||||
Dispensary Jobs
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setActiveTab('scrapers')}
|
||||
@@ -109,7 +109,7 @@ export function ScraperMonitor() {
|
||||
marginBottom: '-2px'
|
||||
}}
|
||||
>
|
||||
Legacy Scrapers
|
||||
Crawl History
|
||||
</button>
|
||||
</div>
|
||||
|
||||
@@ -232,10 +232,10 @@ export function ScraperMonitor() {
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'start' }}>
|
||||
<div style={{ flex: 1 }}>
|
||||
<div style={{ fontSize: '18px', fontWeight: '600', marginBottom: '8px' }}>
|
||||
{job.brand_name}
|
||||
{job.dispensary_name || job.brand_name}
|
||||
</div>
|
||||
<div style={{ fontSize: '14px', color: '#666', marginBottom: '12px' }}>
|
||||
Worker: {job.worker_id} | Job #{job.id}
|
||||
{job.job_type || 'crawl'} | Job #{job.id}
|
||||
</div>
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(150px, 1fr))', gap: '12px' }}>
|
||||
<div>
|
||||
@@ -290,8 +290,8 @@ export function ScraperMonitor() {
|
||||
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
|
||||
<thead>
|
||||
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Brand</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Worker</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Type</th>
|
||||
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Status</th>
|
||||
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Found</th>
|
||||
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Saved</th>
|
||||
@@ -302,8 +302,8 @@ export function ScraperMonitor() {
|
||||
<tbody>
|
||||
{recentJobs.map((job: any) => (
|
||||
<tr key={job.id} style={{ borderBottom: '1px solid #eee' }}>
|
||||
<td style={{ padding: '15px' }}>{job.brand_name}</td>
|
||||
<td style={{ padding: '15px', fontSize: '14px', color: '#666' }}>{job.worker_id || '-'}</td>
|
||||
<td style={{ padding: '15px' }}>{job.dispensary_name || job.brand_name}</td>
|
||||
<td style={{ padding: '15px', fontSize: '14px', color: '#666' }}>{job.job_type || '-'}</td>
|
||||
<td style={{ padding: '15px', textAlign: 'center' }}>
|
||||
<span style={{
|
||||
padding: '4px 10px',
|
||||
@@ -481,22 +481,37 @@ export function ScraperMonitor() {
|
||||
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
|
||||
<thead>
|
||||
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Store</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Category</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Status</th>
|
||||
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Found</th>
|
||||
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Products</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Scraped</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Crawled</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{history.map((item, index) => (
|
||||
<tr key={index} style={{ borderBottom: '1px solid #eee' }}>
|
||||
<td style={{ padding: '15px' }}>{item.store_name}</td>
|
||||
<td style={{ padding: '15px' }}>{item.category_name}</td>
|
||||
<td style={{ padding: '15px' }}>{item.dispensary_name || item.store_name}</td>
|
||||
<td style={{ padding: '15px' }}>
|
||||
<span style={{
|
||||
padding: '4px 10px',
|
||||
borderRadius: '12px',
|
||||
fontSize: '12px',
|
||||
fontWeight: '600',
|
||||
background: item.status === 'completed' ? '#d1fae5' : item.status === 'failed' ? '#fee2e2' : '#fef3c7',
|
||||
color: item.status === 'completed' ? '#065f46' : item.status === 'failed' ? '#991b1b' : '#92400e'
|
||||
}}>
|
||||
{item.status || '-'}
|
||||
</span>
|
||||
</td>
|
||||
<td style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>
|
||||
{item.products_found || '-'}
|
||||
</td>
|
||||
<td style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>
|
||||
{item.product_count}
|
||||
</td>
|
||||
<td style={{ padding: '15px', color: '#666' }}>
|
||||
{new Date(item.last_scraped_at).toLocaleString()}
|
||||
{item.last_scraped_at ? new Date(item.last_scraped_at).toLocaleString() : '-'}
|
||||
</td>
|
||||
</tr>
|
||||
))}
|
||||
|
||||
@@ -17,61 +17,61 @@ const USER_AGENTS = {
|
||||
};
|
||||
|
||||
export function ScraperTools() {
|
||||
const [stores, setStores] = useState<any[]>([]);
|
||||
const [selectedStore, setSelectedStore] = useState<number | null>(null);
|
||||
const [dispensaries, setDispensaries] = useState<any[]>([]);
|
||||
const [selectedDispensary, setSelectedDispensary] = useState<number | null>(null);
|
||||
const [parallelScrapers, setParallelScrapers] = useState(3);
|
||||
const [selectedUserAgent, setSelectedUserAgent] = useState<string>('rotate-desktop');
|
||||
const [scraping, setScraping] = useState(false);
|
||||
const [downloadingImages, setDownloadingImages] = useState(false);
|
||||
const [discoveringCategories, setDiscoveringCategories] = useState(false);
|
||||
const [debugging, setDebugging] = useState(false);
|
||||
const [notification, setNotification] = useState<{ message: string; type: 'success' | 'error' | 'info' } | null>(null);
|
||||
const [loading, setLoading] = useState(true);
|
||||
|
||||
useEffect(() => {
|
||||
loadStores();
|
||||
loadDispensaries();
|
||||
}, []);
|
||||
|
||||
const loadStores = async () => {
|
||||
const loadDispensaries = async () => {
|
||||
setLoading(true);
|
||||
try {
|
||||
const data = await api.getStores();
|
||||
setStores(data.stores);
|
||||
if (data.stores.length > 0) {
|
||||
setSelectedStore(data.stores[0].id);
|
||||
const data = await api.getDispensaries();
|
||||
// Filter to dispensaries that have a menu_url and are scrape enabled
|
||||
const scrapableDispensaries = data.dispensaries.filter((d: any) => d.menu_url && d.scrape_enabled);
|
||||
setDispensaries(scrapableDispensaries);
|
||||
if (scrapableDispensaries.length > 0) {
|
||||
setSelectedDispensary(scrapableDispensaries[0].id);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Failed to load stores:', error);
|
||||
console.error('Failed to load dispensaries:', error);
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
};
|
||||
|
||||
const handleScrape = async () => {
|
||||
if (!selectedStore || scraping) return;
|
||||
if (!selectedDispensary || scraping) return;
|
||||
|
||||
setScraping(true);
|
||||
try {
|
||||
await api.scrapeStore(selectedStore, parallelScrapers, selectedUserAgent || undefined);
|
||||
await api.triggerDispensaryCrawl(selectedDispensary);
|
||||
setNotification({
|
||||
message: `Scrape started with ${parallelScrapers} parallel scrapers using ${USER_AGENTS[selectedUserAgent as keyof typeof USER_AGENTS] || 'Random'} UA! Check the Scraper Monitor for progress.`,
|
||||
message: `Crawl started for dispensary! Check the Scraper Monitor for progress.`,
|
||||
type: 'success'
|
||||
});
|
||||
} catch (error: any) {
|
||||
setNotification({ message: 'Failed to start scrape: ' + error.message, type: 'error' });
|
||||
setNotification({ message: 'Failed to start crawl: ' + error.message, type: 'error' });
|
||||
} finally {
|
||||
setScraping(false);
|
||||
}
|
||||
};
|
||||
|
||||
const handleDownloadImages = async () => {
|
||||
if (!selectedStore || downloadingImages) return;
|
||||
if (!selectedDispensary || downloadingImages) return;
|
||||
|
||||
setDownloadingImages(true);
|
||||
try {
|
||||
const result = await api.downloadStoreImages(selectedStore);
|
||||
// TODO: Implement dispensary image download endpoint
|
||||
setNotification({
|
||||
message: `Image download started! ${result.total_missing} missing images will be downloaded.`,
|
||||
message: `Image download feature coming soon!`,
|
||||
type: 'info'
|
||||
});
|
||||
} catch (error: any) {
|
||||
@@ -81,35 +81,7 @@ export function ScraperTools() {
|
||||
}
|
||||
};
|
||||
|
||||
const handleDiscoverCategories = async () => {
|
||||
if (!selectedStore || discoveringCategories) return;
|
||||
|
||||
setDiscoveringCategories(true);
|
||||
try {
|
||||
await api.discoverStoreCategories(selectedStore);
|
||||
setNotification({ message: 'Category discovery started! Check logs for progress.', type: 'info' });
|
||||
} catch (error: any) {
|
||||
setNotification({ message: 'Failed to start category discovery: ' + error.message, type: 'error' });
|
||||
} finally {
|
||||
setDiscoveringCategories(false);
|
||||
}
|
||||
};
|
||||
|
||||
const handleDebug = async () => {
|
||||
if (!selectedStore || debugging) return;
|
||||
|
||||
setDebugging(true);
|
||||
try {
|
||||
await api.debugScrapeStore(selectedStore);
|
||||
setNotification({ message: 'Debug started! Check Logs page for output.', type: 'info' });
|
||||
} catch (error: any) {
|
||||
setNotification({ message: 'Debug failed: ' + error.message, type: 'error' });
|
||||
} finally {
|
||||
setDebugging(false);
|
||||
}
|
||||
};
|
||||
|
||||
const selectedStoreData = stores.find(s => s.id === selectedStore);
|
||||
const selectedDispensaryData = dispensaries.find(d => d.id === selectedDispensary);
|
||||
|
||||
if (loading) {
|
||||
return (
|
||||
@@ -133,32 +105,32 @@ export function ScraperTools() {
|
||||
<div className="space-y-6">
|
||||
<div>
|
||||
<h1 className="text-3xl font-bold">Scraper Tools</h1>
|
||||
<p className="text-gray-500 mt-2">Manage scraping operations for your stores</p>
|
||||
<p className="text-gray-500 mt-2">Manage crawling operations for dispensaries</p>
|
||||
</div>
|
||||
|
||||
{/* Store Selection */}
|
||||
{/* Dispensary Selection */}
|
||||
<div className="card bg-base-100 shadow-xl">
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Select Store</h2>
|
||||
<h2 className="card-title">Select Dispensary</h2>
|
||||
<select
|
||||
className="select select-bordered w-full max-w-md"
|
||||
value={selectedStore || ''}
|
||||
onChange={(e) => setSelectedStore(parseInt(e.target.value))}
|
||||
value={selectedDispensary || ''}
|
||||
onChange={(e) => setSelectedDispensary(parseInt(e.target.value))}
|
||||
>
|
||||
{stores.map(store => (
|
||||
<option key={store.id} value={store.id}>
|
||||
{store.name} ({store.product_count || 0} products)
|
||||
{dispensaries.map(disp => (
|
||||
<option key={disp.id} value={disp.id}>
|
||||
{disp.dba_name || disp.name} - {disp.city}, {disp.state}
|
||||
</option>
|
||||
))}
|
||||
</select>
|
||||
|
||||
{selectedStoreData && (
|
||||
{selectedDispensaryData && (
|
||||
<div className="mt-4 p-4 bg-base-200 rounded-lg">
|
||||
<div className="grid grid-cols-2 md:grid-cols-4 gap-4 text-sm">
|
||||
<div>
|
||||
<div className="text-gray-500">Status</div>
|
||||
<div className="font-semibold">
|
||||
{selectedStoreData.scrape_enabled ? (
|
||||
{selectedDispensaryData.scrape_enabled ? (
|
||||
<span className="badge badge-success">Enabled</span>
|
||||
) : (
|
||||
<span className="badge badge-error">Disabled</span>
|
||||
@@ -166,18 +138,18 @@ export function ScraperTools() {
|
||||
</div>
|
||||
</div>
|
||||
<div>
|
||||
<div className="text-gray-500">Categories</div>
|
||||
<div className="font-semibold">{selectedStoreData.category_count || 0}</div>
|
||||
<div className="text-gray-500">Provider</div>
|
||||
<div className="font-semibold">{selectedDispensaryData.provider_type || 'Unknown'}</div>
|
||||
</div>
|
||||
<div>
|
||||
<div className="text-gray-500">Products</div>
|
||||
<div className="font-semibold">{selectedStoreData.product_count || 0}</div>
|
||||
<div className="font-semibold">{selectedDispensaryData.product_count || 0}</div>
|
||||
</div>
|
||||
<div>
|
||||
<div className="text-gray-500">Last Scraped</div>
|
||||
<div className="text-gray-500">Last Crawled</div>
|
||||
<div className="font-semibold">
|
||||
{selectedStoreData.last_scraped_at
|
||||
? new Date(selectedStoreData.last_scraped_at).toLocaleDateString()
|
||||
{selectedDispensaryData.last_crawl_at
|
||||
? new Date(selectedDispensaryData.last_crawl_at).toLocaleDateString()
|
||||
: 'Never'}
|
||||
</div>
|
||||
</div>
|
||||
@@ -189,56 +161,21 @@ export function ScraperTools() {
|
||||
|
||||
{/* Scraper Actions */}
|
||||
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
|
||||
{/* Scrape Now */}
|
||||
{/* Crawl Now */}
|
||||
<div className="card bg-base-100 shadow-xl">
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Scrape Store</h2>
|
||||
<h2 className="card-title">Crawl Dispensary</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Start scraping products from the selected store
|
||||
Start crawling products from the selected dispensary menu
|
||||
</p>
|
||||
|
||||
<div className="form-control w-full mt-4">
|
||||
<label className="label">
|
||||
<span className="label-text">Parallel Scrapers</span>
|
||||
</label>
|
||||
<input
|
||||
type="number"
|
||||
min="1"
|
||||
max="10"
|
||||
value={parallelScrapers}
|
||||
onChange={(e) => setParallelScrapers(parseInt(e.target.value) || 3)}
|
||||
className="input input-bordered w-full"
|
||||
/>
|
||||
<label className="label">
|
||||
<span className="label-text-alt">Number of concurrent scraping processes (1-10)</span>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<div className="form-control w-full mt-4">
|
||||
<label className="label">
|
||||
<span className="label-text">User Agent</span>
|
||||
</label>
|
||||
<select
|
||||
className="select select-bordered w-full"
|
||||
value={selectedUserAgent}
|
||||
onChange={(e) => setSelectedUserAgent(e.target.value)}
|
||||
>
|
||||
{Object.entries(USER_AGENTS).map(([key, label]) => (
|
||||
<option key={key} value={key}>{label}</option>
|
||||
))}
|
||||
</select>
|
||||
<label className="label">
|
||||
<span className="label-text-alt">Browser/bot identity for scraping session</span>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<div className="card-actions justify-end mt-4">
|
||||
<button
|
||||
onClick={handleScrape}
|
||||
disabled={!selectedStore || scraping}
|
||||
disabled={!selectedDispensary || scraping}
|
||||
className={`btn btn-primary ${scraping ? 'loading' : ''}`}
|
||||
>
|
||||
{scraping ? 'Scraping...' : 'Start Scrape'}
|
||||
{scraping ? 'Starting...' : 'Start Crawl'}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
@@ -249,13 +186,13 @@ export function ScraperTools() {
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Download Images</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Download missing product images for the selected store
|
||||
Download missing product images for the selected dispensary
|
||||
</p>
|
||||
|
||||
<div className="card-actions justify-end mt-auto">
|
||||
<button
|
||||
onClick={handleDownloadImages}
|
||||
disabled={!selectedStore || downloadingImages}
|
||||
disabled={!selectedDispensary || downloadingImages}
|
||||
className={`btn btn-secondary ${downloadingImages ? 'loading' : ''}`}
|
||||
>
|
||||
{downloadingImages ? 'Downloading...' : 'Download Missing Images'}
|
||||
@@ -263,46 +200,6 @@ export function ScraperTools() {
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Discover Categories */}
|
||||
<div className="card bg-base-100 shadow-xl">
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Discover Categories</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Automatically discover and create categories from the store
|
||||
</p>
|
||||
|
||||
<div className="card-actions justify-end mt-auto">
|
||||
<button
|
||||
onClick={handleDiscoverCategories}
|
||||
disabled={!selectedStore || discoveringCategories}
|
||||
className={`btn btn-accent ${discoveringCategories ? 'loading' : ''}`}
|
||||
>
|
||||
{discoveringCategories ? 'Discovering...' : 'Discover Categories'}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Debug Scraper */}
|
||||
<div className="card bg-base-100 shadow-xl">
|
||||
<div className="card-body">
|
||||
<h2 className="card-title">Debug Scraper</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Run scraper in debug mode and view detailed logs
|
||||
</p>
|
||||
|
||||
<div className="card-actions justify-end mt-auto">
|
||||
<button
|
||||
onClick={handleDebug}
|
||||
disabled={!selectedStore || debugging}
|
||||
className={`btn btn-warning ${debugging ? 'loading' : ''}`}
|
||||
>
|
||||
{debugging ? 'Debugging...' : 'Start Debug'}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Quick Links */}
|
||||
|
||||
Reference in New Issue
Block a user