const BaseCrawler = require("../base/BaseCrawler");
const CrawlerError = require("../../errors/CrawlerError");
const path = require("path");
const fs = require("fs");
const fsPromises = require("fs").promises;
const axios = require("axios");
const FormData = require("form-data");

/**
 * Amazon crawler implementation.
 *
 * Drives `this.page` (a Playwright-style page object; presumably assigned by
 * `BaseCrawler#initBrowser` — confirm against the base class) to read a
 * product's title, price, coupon, points and ASIN, and can optionally upload
 * a full-page screenshot.
 */
class AmazonCrawler extends BaseCrawler {
  /**
   * @param {object} config - Crawler configuration, forwarded to BaseCrawler.
   * @param {object} [config.timeouts] - Overrides for pageLoad / elementWait / networkIdle (ms).
   * @param {object} [config.retry] - Overrides for maxAttempts / delay (ms).
   * @param {object} [config.common] - Shared settings; `common.upload` overrides url / scene / timeout.
   */
  constructor(config) {
    super(config);

    this.selectors = {
      title: "#productTitle",
      price: "span.a-price > span.a-offscreen",
      coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
      variants: ".a-cardui-body #twister-plus-inline-twister > .a-section",
      point: "#points_feature_div .a-color-price",
    };

    // Overrides are merged onto the defaults so that a partial override
    // (e.g. only `retry.delay`) does not leave the other keys undefined.
    this.timeouts = {
      pageLoad: 60000, // page load timeout
      elementWait: 10000, // element wait timeout
      networkIdle: 5000, // network idle timeout
      ...config.timeouts,
    };

    this.retryConfig = {
      maxAttempts: 1, // total number of navigation attempts
      delay: 2000, // pause between attempts
      ...config.retry,
    };

    this.uploadConfig = {
      url: "https://apibase.sohomall.jp/uploaders",
      scene: "digital-yy",
      timeout: 600000,
      ...config.common?.upload,
    };
  }

  /**
   * Create the screenshot directory (`<cwd>/screenshots`) if it is missing.
   * @returns {Promise<string>} Path of the screenshot directory.
   * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when mkdir fails with anything other than EEXIST.
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), "screenshots");
    try {
      await fsPromises.mkdir(dir, { recursive: true });
    } catch (error) {
      if (error.code !== "EEXIST") {
        throw new CrawlerError(
          "创建截图目录失败",
          "SCREENSHOT_DIR_ERROR",
          "amazon",
          error
        );
      }
    }
    return dir;
  }

  /**
   * Upload an image file to the configured upload endpoint.
   * @param {string} imagePath - Path of the image on disk.
   * @returns {Promise<string>} URL reported by the server (`response.data.url`).
   * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on HTTP failure or an unexpected response shape.
   */
  async uploadImage(imagePath) {
    try {
      const formData = new FormData();
      formData.append("file", fs.createReadStream(imagePath));
      formData.append("scene", this.uploadConfig.scene);

      const response = await axios.post(this.uploadConfig.url, formData, {
        // getHeaders() already supplies a Content-Type that includes the
        // multipart boundary; setting a bare "multipart/form-data" on top of
        // it would drop the boundary, so no explicit Content-Type is added.
        headers: formData.getHeaders(),
        timeout: this.uploadConfig.timeout,
      });

      if (!response.data || !response.data.url) {
        throw new Error("上传响应格式错误");
      }
      return response.data.url;
    } catch (error) {
      if (error.response) {
        throw new CrawlerError(
          `图片上传失败: ${error.response.status} ${error.response.statusText}`,
          "IMAGE_UPLOAD_ERROR",
          "amazon",
          error
        );
      }
      throw new CrawlerError(
        "图片上传失败",
        "IMAGE_UPLOAD_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Apply the per-page settings used by this crawler: default timeouts and a
   * request filter that aborts font and media requests (images stay enabled).
   * Must be re-run whenever the browser/page is re-created.
   * @returns {Promise<void>}
   */
  async applyAmazonPageSettings() {
    this.page.setDefaultTimeout(this.timeouts.elementWait);
    this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);

    const blockedTypes = ["font", "media"];
    await this.page.route("**/*", (route) =>
      blockedTypes.includes(route.request().resourceType())
        ? route.abort()
        : route.continue()
    );
  }

  /**
   * Navigate to a URL, retrying up to `retryConfig.maxAttempts` times.
   * Between attempts the browser is closed and re-initialised.
   * @param {string} url - Target URL.
   * @returns {Promise<void>}
   * @throws {CrawlerError} NAVIGATION_ERROR once every attempt has failed.
   */
  async navigateWithRetry(url) {
    const { maxAttempts, delay } = this.retryConfig;
    let lastError;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        await this.page.goto(url, {
          waitUntil: "networkidle",
          timeout: this.timeouts.pageLoad,
        });
        return;
      } catch (error) {
        lastError = error;
        console.log(`导航尝试 ${attempt}/${maxAttempts} 失败:`, error.message);

        if (attempt < maxAttempts) {
          console.log(`等待 ${delay}ms 后重试...`);
          await new Promise((resolve) => setTimeout(resolve, delay));

          // Re-initialise the browser, then restore the timeouts and request
          // filter: they were installed on the previous page object and
          // would otherwise be missing for the retry.
          await this.closeBrowser();
          await this.initBrowser();
          await this.applyAmazonPageSettings();
        }
      }
    }

    throw new CrawlerError(
      `页面导航失败,已重试 ${maxAttempts} 次`,
      "NAVIGATION_ERROR",
      "amazon",
      lastError
    );
  }

  /**
   * Wait for an element to appear, bounded by `timeouts.elementWait`.
   * @param {string} selector - CSS selector.
   * @returns {Promise<void>}
   * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait fails.
   */
  async waitForElement(selector) {
    try {
      await this.page.waitForSelector(selector, {
        timeout: this.timeouts.elementWait,
      });
    } catch (error) {
      throw new CrawlerError(
        `等待元素超时: ${selector}`,
        "ELEMENT_WAIT_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Parse a price string into an integer amount.
   * @param {string} priceText - Raw price text, e.g. "¥1,234".
   * @returns {Promise<number|null>} Parsed amount, or null when the text is empty or has no digits.
   * @throws {CrawlerError} PRICE_PARSE_ERROR when the input cannot be processed (e.g. not a string).
   */
  async parsePrice(priceText) {
    try {
      if (!priceText) return null;

      // When point information is appended, keep only the price part.
      const pricePart = priceText.includes("ポイント")
        ? priceText.split("ポイント")[0]
        : priceText;

      // Drop whitespace and thousands separators, then take the first digit
      // run; any currency prefix (¥, ￥, JP¥, ...) is ignored by construction.
      const digits = pricePart.replace(/[\s,]/g, "").match(/\d+/);
      return digits ? Number.parseInt(digits[0], 10) : null;
    } catch (error) {
      throw new CrawlerError(
        "价格解析失败",
        "PRICE_PARSE_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Read the reward-point amount shown on the page.
   * Best-effort: any failure is logged and reported as 0.
   * @returns {Promise<number>} Point amount, or 0 when absent or unreadable.
   */
  async handlePoint() {
    try {
      const pointTrigger = await this.page.$(this.selectors.point);
      if (!pointTrigger) {
        return 0; // no points on this page
      }

      const pointText = await this.page.$eval(this.selectors.point, (el) =>
        el.textContent.trim()
      );

      // Always return a number, and keep thousands separators together so
      // "1,234" is read as 1234 rather than 1.
      const match = pointText.match(/\d[\d,]*/);
      return match ? Number.parseInt(match[0].replace(/,/g, ""), 10) : 0;
    } catch (error) {
      console.log("积分处理失败:", error.message);
      return 0; // return 0 on error instead of throwing
    }
  }

  /**
   * Open the coupon pop-up (if any), read the coupon amount and close it again.
   * Best-effort: any failure is logged and reported as 0.
   * @returns {Promise<number>} Coupon amount, or 0 when none is found.
   */
  async handleCoupon() {
    try {
      let couponValue = 0;

      const couponTrigger = await this.page.$(this.selectors.coupon);
      if (!couponTrigger) {
        return 0; // no coupon on this page
      }

      try {
        // Click the coupon trigger and give the pop-up time to render.
        await couponTrigger.click();
        await this.page.waitForTimeout(1000);

        const couponText = await this.page.$eval(".couponLabelText", (el) =>
          el.textContent.trim()
        );

        // Accept both the half-width (U+00A5) and full-width (U+FFE5) yen sign.
        const match = couponText.match(/[\u00A5\uFFE5]\s*([\d,]+)/);
        if (match) {
          couponValue = Number.parseInt(match[1].replace(/,/g, ""), 10);
        }

        // Try the close button first; fall back to Escape if it is missing.
        try {
          await this.page.click("button.a-modal-close", { timeout: 2000 });
        } catch (closeError) {
          await this.page.keyboard.press("Escape");
        }

        // Let the pop-up disappear before continuing.
        await this.page.waitForTimeout(500);
      } catch (clickError) {
        console.log("没有优惠券", clickError.message);
        // The click may have opened something anyway; try to dismiss it.
        try {
          await this.page.keyboard.press("Escape");
        } catch (escError) {
          console.log("ESC键关闭失败:", escError.message);
        }
      }

      return couponValue;
    } catch (error) {
      console.log("优惠券处理失败:", error.message);
      return 0; // return 0 on error instead of throwing
    }
  }

  /**
   * Read the product title.
   * @returns {Promise<string>} Trimmed product title.
   * @throws {CrawlerError} TITLE_GET_ERROR when the title element cannot be read.
   */
  async getTitle() {
    try {
      return await this.page.$eval(this.selectors.title, (el) =>
        el.textContent.trim()
      );
    } catch (error) {
      throw new CrawlerError(
        "获取标题失败",
        "TITLE_GET_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Extract the product SKU (the 10-character id after `/dp/`) from the page URL.
   * @returns {Promise<string|null>} SKU, or null when the URL does not contain one.
   * @throws {CrawlerError} SKU_GET_ERROR when the URL cannot be read.
   */
  async getSku() {
    try {
      const url = this.page.url();
      return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
    } catch (error) {
      throw new CrawlerError("获取SKU失败", "SKU_GET_ERROR", "amazon", error);
    }
  }

  /**
   * Collect the variant selector buttons, grouped by variant section.
   * @returns {Promise<Array<Array<object>>>} One array of button element handles per
   *   variant section that has at least one button.
   * @throws {CrawlerError} VARIANTS_GET_ERROR when the variant sections cannot be read.
   */
  async getVariants() {
    try {
      await this.page.waitForSelector(this.selectors.variants);
      const groupEls = await this.page.$$(this.selectors.variants);

      const groups = [];
      for (const groupEl of groupEls) {
        const btns = await groupEl.$$(".a-button-inner .a-button-input");
        if (btns.length) groups.push(btns);
      }
      return groups;
    } catch (error) {
      throw new CrawlerError(
        "获取变体信息失败",
        "VARIANTS_GET_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Read title, effective price, SKU and URL of the product currently shown.
   * The effective price is the listed price minus coupon and point amounts,
   * formatted with `toLocaleString()` inside the page.
   * @returns {Promise<{title: string, price: string, sku: string, url: string}>}
   * @throws {CrawlerError} SKU_INFO_GET_ERROR when reading fails or title, price or SKU is missing.
   */
  async getSingleSkuInfo() {
    try {
      // Wait for the page to settle and for the title to be present.
      await this.page.waitForLoadState("networkidle");
      await this.waitForElement(this.selectors.title);

      const couponValue = await this.handleCoupon();
      const pointValue = await this.handlePoint();

      // This callback runs inside the page, so it must be self-contained and
      // cannot call methods of this class.
      const info = await this.page.evaluate(
        ({ selectors, couponValue, pointValue }) => {
          const title =
            document.querySelector(selectors.title)?.textContent.trim() || null;
          let priceText =
            document.querySelector(selectors.price)?.textContent.trim() || null;

          // When point information is appended, keep only the price part.
          if (priceText?.includes("ポイント")) {
            priceText = priceText.split("ポイント")[0].trim();
          }

          // Take the first digit run after dropping whitespace and thousands
          // separators, so any currency prefix (¥, ￥, JP¥, ...) is tolerated.
          const digits = priceText?.replace(/[\s,]/g, "").match(/\d+/)?.[0];
          const price = digits
            ? Number.parseInt(digits, 10) - couponValue - pointValue
            : null;

          const url = window.location.href;
          const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;

          return {
            title,
            price: price ? price.toLocaleString() : null,
            sku: asin,
            url,
          };
        },
        { selectors: this.selectors, couponValue, pointValue }
      );

      // All three fields are required.
      if (!info.title || !info.price || !info.sku) {
        throw new Error("商品信息不完整");
      }
      return info;
    } catch (error) {
      throw new CrawlerError(
        "获取SKU信息失败",
        "SKU_INFO_GET_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Main entry point: crawl one product page.
   * The query string of `url` is stripped before navigating. The browser is
   * always closed afterwards, whether the crawl succeeds or fails.
   * @param {string} url - Product URL.
   * @param {boolean} [needScreenshot=false] - Whether to capture and upload a full-page screenshot.
   * @returns {Promise<Array<object>>} Single-element array with the product info; each item gets
   *   `screenshotUrl` when a screenshot was uploaded successfully.
   * @throws {CrawlerError} CRAWL_ERROR wrapping any failure other than screenshot handling.
   */
  async crawl(url, needScreenshot = false) {
    try {
      await this.initBrowser();
      await this.applyAmazonPageSettings();

      await this.navigateWithRetry(url.split("?")[0]);

      // Only the currently displayed SKU is read.
      const data = [await this.getSingleSkuInfo()];

      if (needScreenshot) {
        try {
          const dir = await this.createScreenshotDir();
          const shot = path.join(dir, `${Date.now()}.png`);

          await this.page.waitForLoadState("networkidle");
          await this.page.screenshot({
            path: shot,
            fullPage: true,
            timeout: this.timeouts.elementWait,
          });

          try {
            const imageUrl = await this.uploadImage(shot);
            data.forEach((item) => {
              item.screenshotUrl = imageUrl;
            });
          } finally {
            // Remove the temporary file even when the upload fails, so failed
            // uploads do not accumulate screenshots on disk.
            try {
              await fsPromises.unlink(shot);
            } catch (error) {
              console.error("删除临时截图文件失败:", error);
            }
          }
        } catch (error) {
          // A screenshot failure must not fail the crawl itself.
          console.error("截图处理失败:", error);
        }
      }

      return data;
    } catch (error) {
      throw new CrawlerError(
        "商品信息抓取失败",
        "CRAWL_ERROR",
        "amazon",
        error
      );
    } finally {
      await this.closeBrowser();
    }
  }
}

module.exports = AmazonCrawler;