const BaseCrawler = require('../base/BaseCrawler');
const CrawlerError = require('../../errors/CrawlerError');
const path = require('path');
const fs = require('fs');
const fsPromises = require('fs').promises;
const axios = require('axios');
const FormData = require('form-data');

/**
 * Amazon crawler implementation.
 *
 * Drives the browser page provided by BaseCrawler (`this.page`,
 * `initBrowser()`, `closeBrowser()`) to read title, price, coupon and
 * variant information from an Amazon product page, optionally taking a
 * full-page screenshot and uploading it.
 */
class AmazonCrawler extends BaseCrawler {
    /**
     * @param {object} config - Crawler configuration, forwarded to BaseCrawler.
     * @param {object} [config.timeouts] - Overrides for pageLoad / elementWait / networkIdle (ms).
     * @param {object} [config.retry] - Overrides for maxAttempts / delay (ms).
     * @param {object} [config.common] - Shared settings; `common.upload` overrides url / scene / timeout.
     */
    constructor(config) {
        super(config);

        this.selectors = {
            title: '#productTitle',
            price: 'span.a-price > span.a-offscreen',
            coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
            variants: '.a-cardui-body #twister-plus-inline-twister > .a-section'
        };

        // Defaults are merged with the supplied values so that a partial
        // override (e.g. only `pageLoad`) does not leave the other keys undefined.
        this.timeouts = {
            pageLoad: 60000,    // page load timeout
            elementWait: 10000, // element wait timeout
            networkIdle: 5000,  // network idle timeout
            ...config.timeouts
        };

        this.retryConfig = {
            maxAttempts: 3, // maximum number of navigation attempts
            delay: 2000,    // delay between attempts (ms)
            ...config.retry
        };

        this.uploadConfig = {
            url: 'https://apibase.sohomall.jp/uploaders',
            scene: 'goods',
            timeout: 600000,
            ...config.common?.upload
        };
    }

    /**
     * Create the screenshot directory (`<cwd>/screenshots`) if needed.
     * @returns {Promise<string>} Path of the screenshot directory.
     * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when the directory cannot be created.
     */
    async createScreenshotDir() {
        const dir = path.join(process.cwd(), 'screenshots');
        try {
            await fsPromises.mkdir(dir, { recursive: true });
        } catch (error) {
            if (error.code !== 'EEXIST') {
                throw new CrawlerError('创建截图目录失败', 'SCREENSHOT_DIR_ERROR', 'amazon', error);
            }
        }
        return dir;
    }

    /**
     * Upload an image file to the configured upload endpoint.
     * @param {string} imagePath - Path of the image on disk.
     * @returns {Promise<string>} URL of the uploaded image (`response.data.url`).
     * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on HTTP failure or malformed response.
     */
    async uploadImage(imagePath) {
        try {
            const formData = new FormData();
            formData.append('file', fs.createReadStream(imagePath));
            formData.append('scene', this.uploadConfig.scene);

            const response = await axios.post(this.uploadConfig.url, formData, {
                // getHeaders() already supplies "multipart/form-data; boundary=...".
                // Adding a bare Content-Type on top of it can drop the boundary
                // and make the body unparseable on the server.
                headers: formData.getHeaders(),
                timeout: this.uploadConfig.timeout
            });

            if (!response.data || !response.data.url) {
                throw new Error('上传响应格式错误');
            }

            return response.data.url;
        } catch (error) {
            if (error.response) {
                throw new CrawlerError(
                    `图片上传失败: ${error.response.status} ${error.response.statusText}`,
                    'IMAGE_UPLOAD_ERROR',
                    'amazon',
                    error
                );
            }
            throw new CrawlerError('图片上传失败', 'IMAGE_UPLOAD_ERROR', 'amazon', error);
        }
    }

    /**
     * Apply the default timeouts and resource blocking to the current page.
     *
     * Kept separate so the same settings can be re-applied after the browser
     * is re-initialised during navigation retries.
     * @returns {Promise<void>}
     */
    async configurePage() {
        this.page.setDefaultTimeout(this.timeouts.elementWait);
        this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);

        // Block only fonts and media; images are still loaded.
        await this.page.route('**/*', route => {
            const resourceType = route.request().resourceType();
            if (['font', 'media'].includes(resourceType)) {
                return route.abort();
            }
            return route.continue();
        });
    }

    /**
     * Navigate to a URL, retrying with a fresh browser on failure.
     * @param {string} url - Target URL.
     * @returns {Promise<void>}
     * @throws {CrawlerError} NAVIGATION_ERROR once all attempts have failed.
     */
    async navigateWithRetry(url) {
        let lastError;

        for (let attempt = 1; attempt <= this.retryConfig.maxAttempts; attempt++) {
            try {
                await this.page.goto(url, {
                    waitUntil: 'networkidle',
                    timeout: this.timeouts.pageLoad
                });
                return;
            } catch (error) {
                lastError = error;
                console.log(`导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`, error.message);

                if (attempt < this.retryConfig.maxAttempts) {
                    console.log(`等待 ${this.retryConfig.delay}ms 后重试...`);
                    await new Promise(resolve => setTimeout(resolve, this.retryConfig.delay));

                    // Restart the browser, then restore the page settings.
                    // NOTE(review): assumes initBrowser() creates a new page, which
                    // would otherwise lose the timeouts and request routing.
                    await this.closeBrowser();
                    await this.initBrowser();
                    await this.configurePage();
                }
            }
        }

        throw new CrawlerError(
            `页面导航失败,已重试 ${this.retryConfig.maxAttempts} 次`,
            'NAVIGATION_ERROR',
            'amazon',
            lastError
        );
    }

    /**
     * Wait for an element to appear on the page.
     * @param {string} selector - Selector to wait for.
     * @returns {Promise<void>}
     * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait fails or times out.
     */
    async waitForElement(selector) {
        try {
            await this.page.waitForSelector(selector, {
                timeout: this.timeouts.elementWait
            });
        } catch (error) {
            throw new CrawlerError(
                `等待元素超时: ${selector}`,
                'ELEMENT_WAIT_ERROR',
                'amazon',
                error
            );
        }
    }

    /**
     * Parse a price string into an integer amount.
     * @param {string} priceText - Raw price text.
     * @returns {Promise<number|null>} Parsed price, or null when no number is found.
     * @throws {CrawlerError} PRICE_PARSE_ERROR on unexpected failure (e.g. non-string input).
     */
    async parsePrice(priceText) {
        try {
            if (!priceText) return null;

            // Strip currency markers (half- and full-width yen, "JP") and whitespace.
            let text = priceText.replace(/[¥￥JP\s]/g, '');

            // When points information is appended, keep only the price part.
            if (text.includes('ポイント')) {
                text = text.split('ポイント')[0].trim();
            }

            // Extract the numeric part.
            const match = text.match(/([\d,]+)/);
            if (!match) return null;

            const value = Number.parseInt(match[1].replace(/,/g, ''), 10);
            return Number.isNaN(value) ? null : value;
        } catch (error) {
            throw new CrawlerError('价格解析失败', 'PRICE_PARSE_ERROR', 'amazon', error);
        }
    }

    /**
     * Detect a coupon on the page and read its amount.
     *
     * Best-effort: every failure is logged and results in 0 rather than an exception.
     * @returns {Promise<number>} Coupon amount, or 0 when none is found.
     */
    async handleCoupon() {
        try {
            let couponValue = 0;

            const couponTrigger = await this.page.$(this.selectors.coupon);
            if (!couponTrigger) {
                return 0; // no coupon on this page
            }

            try {
                // Open the coupon modal and give it time to render.
                await couponTrigger.click();
                await this.page.waitForTimeout(1000);

                const couponText = await this.page.$eval('.couponLabelText', el => el.textContent.trim());

                // Parse the coupon amount (half- or full-width yen sign).
                const match = couponText.match(/[¥￥]\s*([\d,]+)/);
                if (match) {
                    couponValue = Number.parseInt(match[1].replace(/,/g, ''), 10);
                }

                // Try to close the modal; fall back to Escape if the button is missing.
                try {
                    await this.page.click('button.a-modal-close', { timeout: 2000 });
                } catch (closeError) {
                    await this.page.keyboard.press('Escape');
                }

                // Give the modal time to disappear.
                await this.page.waitForTimeout(500);
            } catch (clickError) {
                console.log('优惠券点击或处理失败:', clickError.message);
                // A modal may still be open; try to dismiss it.
                try {
                    await this.page.keyboard.press('Escape');
                } catch (escError) {
                    console.log('ESC键关闭失败:', escError.message);
                }
            }

            return couponValue;
        } catch (error) {
            console.log('优惠券处理失败:', error.message);
            return 0; // deliberately swallow: coupons are optional
        }
    }

    /**
     * Read the product title.
     * @returns {Promise<string>} Trimmed product title.
     * @throws {CrawlerError} TITLE_GET_ERROR when the title element cannot be read.
     */
    async getTitle() {
        try {
            return await this.page.$eval(this.selectors.title, el => el.textContent.trim());
        } catch (error) {
            throw new CrawlerError('获取标题失败', 'TITLE_GET_ERROR', 'amazon', error);
        }
    }

    /**
     * Read the product SKU (the 10-character code after `/dp/` in the page URL).
     * @returns {Promise<string|null>} SKU, or null when the URL does not contain one.
     * @throws {CrawlerError} SKU_GET_ERROR when the URL cannot be read.
     */
    async getSku() {
        try {
            const url = this.page.url();
            return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
        } catch (error) {
            throw new CrawlerError('获取SKU失败', 'SKU_GET_ERROR', 'amazon', error);
        }
    }

    /**
     * Collect the variant buttons, grouped by variant dimension.
     * @returns {Promise<Array<Array<object>>>} One array of button handles per
     *   dimension; empty when the page has no variant section.
     * @throws {CrawlerError} VARIANTS_GET_ERROR on any failure other than the
     *   variant section not appearing in time.
     */
    async getVariants() {
        try {
            try {
                await this.page.waitForSelector(this.selectors.variants);
            } catch (error) {
                // A timeout means the product has no variant section; report
                // "no variants" so getAllSkuInfo can fall back to a single SKU.
                if (error?.name === 'TimeoutError') return [];
                throw error;
            }

            const groupEls = await this.page.$$(this.selectors.variants);
            const groups = [];

            for (const groupEl of groupEls) {
                const btns = await groupEl.$$('.a-button-inner .a-button-input');
                if (btns.length) groups.push(btns);
            }

            return groups;
        } catch (error) {
            throw new CrawlerError('获取变体信息失败', 'VARIANTS_GET_ERROR', 'amazon', error);
        }
    }

    /**
     * Read the information of the SKU currently shown on the page.
     * @returns {Promise<{title: string, price: string, sku: string, url: string, remark: (string|null)}>}
     *   SKU information; `price` is the coupon-adjusted amount formatted with
     *   `toLocaleString()` inside the page.
     * @throws {CrawlerError} SKU_INFO_GET_ERROR when reading fails or title/price/sku is missing.
     */
    async getSingleSkuInfo() {
        try {
            // Wait for the page to settle and for the title to be present.
            await this.page.waitForLoadState('networkidle');
            await this.waitForElement(this.selectors.title);

            const couponValue = await this.handleCoupon();

            const info = await this.page.evaluate(({ selectors, couponValue }) => {
                const title = document.querySelector(selectors.title)?.textContent.trim() || null;
                let priceText = document.querySelector(selectors.price)?.textContent.trim() || null;

                // When points information is appended, keep only the price part.
                if (priceText?.includes('ポイント')) {
                    priceText = priceText.split('ポイント')[0].trim();
                }

                // Extract the digit run rather than stripping known symbols, so
                // any currency marker (including full-width "￥") is tolerated.
                const digits = priceText ? priceText.match(/\d[\d,]*/) : null;
                const amount = digits ? Number.parseInt(digits[0].replace(/,/g, ''), 10) : NaN;
                const price = Number.isNaN(amount) ? null : amount - couponValue;

                const url = window.location.href;
                const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;

                return {
                    title,
                    price: price ? price.toLocaleString() : null,
                    sku: asin,
                    url,
                    remark: couponValue > 0 ? `Original Price: JP¥${priceText} Coupon Price: JP¥${couponValue}` : null
                };
            }, { selectors: this.selectors, couponValue });

            // Validate the required fields.
            if (!info.title || !info.price || !info.sku) {
                throw new Error('商品信息不完整');
            }

            return info;
        } catch (error) {
            throw new CrawlerError('获取SKU信息失败', 'SKU_INFO_GET_ERROR', 'amazon', error);
        }
    }

    /**
     * Read the information of every variant combination.
     * @returns {Promise<Array<object>>} One SKU info object per combination, each
     *   with a `variants` array of button labels; a single entry when there are no variants.
     * @throws {CrawlerError} ALL_SKU_INFO_GET_ERROR on failure.
     */
    async getAllSkuInfo() {
        try {
            const groups = await this.getVariants();
            if (!groups.length) return [await this.getSingleSkuInfo()];

            // Build the cartesian product of all variant dimensions.
            const cartesian = (arr1, arr2) => arr1.flatMap(a => arr2.map(b => [...a, b]));
            let combos = groups[0].map(b => [b]);
            for (let i = 1; i < groups.length; i++) {
                combos = cartesian(combos, groups[i]);
            }

            const results = [];
            for (const combo of combos) {
                // Click the button of each dimension in turn.
                for (const btn of combo) {
                    await btn.click();
                    await this.page.waitForLoadState('networkidle');
                }

                const info = await this.getSingleSkuInfo();

                // getAttribute() returns a Promise, so it must be awaited before
                // the fallback; `||` on the Promise itself never reaches `title`.
                info.variants = await Promise.all(
                    combo.map(async btn =>
                        (await btn.getAttribute('aria-label')) || (await btn.getAttribute('title'))
                    )
                );
                results.push(info);
            }

            return results;
        } catch (error) {
            throw new CrawlerError('获取所有SKU信息失败', 'ALL_SKU_INFO_GET_ERROR', 'amazon', error);
        }
    }

    /**
     * Main entry point: crawl product information.
     * @param {string} url - Product URL (the query string is dropped before navigation).
     * @param {boolean} needScreenshot - Whether to take and upload a full-page screenshot.
     * @param {boolean} includeAllSkus - Whether to crawl every variant combination.
     * @returns {Promise<Array<object>>} Array of product information objects.
     * @throws {CrawlerError} CRAWL_ERROR wrapping any failure of the main flow.
     */
    async crawl(url, needScreenshot = false, includeAllSkus = false) {
        try {
            await this.initBrowser();
            await this.configurePage();

            // Navigate to the product page without its query string.
            await this.navigateWithRetry(url.split('?')[0]);

            const data = includeAllSkus
                ? await this.getAllSkuInfo()
                : [await this.getSingleSkuInfo()];

            if (needScreenshot) {
                try {
                    const dir = await this.createScreenshotDir();
                    const filename = `${Date.now()}.png`;
                    const shot = path.join(dir, filename);

                    try {
                        // Wait for the page to finish loading, then capture it in full.
                        await this.page.waitForLoadState('networkidle');
                        await this.page.screenshot({
                            path: shot,
                            fullPage: true,
                            timeout: this.timeouts.elementWait
                        });

                        // Upload the image and attach its URL to every item.
                        const imageUrl = await this.uploadImage(shot);
                        data.forEach(item => {
                            item.screenshotUrl = imageUrl;
                        });
                    } finally {
                        // Always remove the temporary file, even when the upload failed.
                        try {
                            await fsPromises.unlink(shot);
                        } catch (error) {
                            // ENOENT only means the screenshot was never written.
                            if (error.code !== 'ENOENT') {
                                console.error('删除临时截图文件失败:', error);
                            }
                        }
                    }
                } catch (error) {
                    // A screenshot failure must not affect the main flow.
                    console.error('截图处理失败:', error);
                }
            }

            return data;
        } catch (error) {
            throw new CrawlerError('商品信息抓取失败', 'CRAWL_ERROR', 'amazon', error);
        } finally {
            await this.closeBrowser();
        }
    }
}

module.exports = AmazonCrawler;