主要使用语言 & 框架: Python & Scrapy
url(简化) 'https://search.jd.com/Search?keyword={}' method GET headers {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
请求商品列表接口,获取商品 id & 价格
url(简化) 'https://search.jd.com/s_new.php?keyword={搜索词}&page={页数}&s={分页数}' method GET headers {'referer': 'https://search.jd.com/Search'}
使用商品 id,请求商品页面,获得相关信息
url https://item.jd.com/{商品id}.html method GET
url(简化) 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={商品id}' method GET headers {'referer': 'https://search.jd.com/Search'}
创建工程 & 爬虫文件
scrapy startproject jdApp cd jdApp scrapy genspider jdspider jd.com
修改 settings.py 文件
# Scrapy project settings touched by this tutorial.

# Disguise the crawler: send a browser-like User-Agent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/689.36 (KHTML, like Gecko) '
    'Chrome/99.0.5204.108 Safari/637.75'
)

# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Delay between consecutive downloads.
DOWNLOAD_DELAY = 1

# Headers attached to every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # JD checks this header.
    'referer': 'https://search.jd.com/Search',
}
class JdspiderSpider(scrapy.Spider):
    """Skeleton of the JD spider: class-level configuration plus callback stubs.

    The callbacks form a chain: ``start_requests`` -> ``parse`` ->
    ``get_infor`` -> ``get_comment``.
    """

    name = 'jdspider'
    allowed_domains = ['jd.com']

    # Keyword of the goods to crawl.
    good_name = ""

    # First request: the search page, used to learn the total page count.
    search_url = 'https://search.jd.com/Search?keyword={}'

    # Headers sent with that first search request.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

    # Goods listing API; each call returns 30 goods.
    goods_url_api = 'https://search.jd.com/s_new.php?keyword={}&page={}&s={}'

    def start_requests(self):
        """Find the page count and request every page."""

    def parse(self, response: Response):
        """Extract every product id and request the product detail page."""

    def get_infor(self, response: Response):
        """Extract the product details and request the comment summary."""

    def get_comment(self, response: Response):
        """Read the comment summary and return the finished item."""


if __name__ == '__main__':
    # Launch the spider when this file is executed directly.
    from scrapy import cmdline

    cmdline.execute(["scrapy", "runspider", "jdspider.py"])
# Find the total page count, then request every listing page.
def start_requests(self):
    """Prompt for a keyword and yield one request per goods-listing API page.

    The search page is fetched synchronously to read the total page count;
    twice that many listing API pages are then requested, each handled by
    ``self.parse``.

    Yields:
        Request: one request per listing API page.

    Stops without yielding anything (after logging an error) when the page
    count cannot be found in the search page, e.g. because the site served
    a different page than expected.
    """
    from urllib.parse import quote

    self.good_name = input("输入爬取的关键词:")
    # Percent-encode the keyword so reserved characters such as '&' or '#'
    # cannot corrupt the query string.
    keyword = quote(self.good_name)

    # A timeout keeps a stalled connection from hanging the spider forever.
    r = requests.get(
        url=self.search_url.format(keyword),
        headers=self.headers,
        timeout=10,
    )

    # Total page count as shown by the pager on the search page.
    found = re.search(r'<b>1</b><em>/</em><i>(\d*?)</i>', r.text)
    if found is None or not found.group(1):
        logger.error(msg=f"无法获取总页数: {r.url}")
        return
    all_pages = found.group(1)
    logger.info(msg=f"总页数 {all_pages}")

    for page in range(1, int(all_pages) * 2 + 1):
        # Build the link of each API page; the offset grows by 25 per page.
        goods_url_api = self.goods_url_api.format(keyword, page, 1 + 25 * (page - 1))
        logger.info(msg=f"请求链接:{goods_url_api}")
        # Request the goods of this page.
        yield Request(
            url=goods_url_api,
            callback=self.parse,
        )
parse(self, response: Response)
# Extract every product id on the page and request its detail page.
def parse(self, response: Response):
    """Yield one detail-page request per product found in the listing.

    Args:
        response: a goods-listing page.

    Yields:
        Request: a request for ``https://item.jd.com/<id>.html`` handled by
        ``self.get_infor``; the partially filled item (price and id) and the
        URL travel in ``meta``.
    """
    goods: list[Selector] = response.xpath("//div[@id='J_goodsList']/ul/li")
    for good in goods:
        # Product id; entries without one cannot be turned into a detail URL.
        good_id = good.xpath("@data-sku").get()
        if not good_id:
            continue

        # A fresh item per product, so nothing is shared between requests.
        item = JdappItem()
        # Product price. The leading '.' keeps the query relative to this
        # <li>; a bare '//' would search the whole document and return the
        # first price on the page for every product.
        item['good_price'] = good.xpath('.//div[@class="p-price"]/strong/i/text()').get()
        item['good_id'] = good_id

        url = f'https://item.jd.com/{item["good_id"]}.html'
        logger.info(msg=f"正在爬取商品详情:{url}")
        yield Request(
            url=url,
            meta={'item': item, 'url': url},
            callback=self.get_infor,
        )
get_infor(self, response: Response)
# Collect the product detail data, then request the comment summary.
def get_infor(self, response: Response):
    """Fill the item with detail-page data and request its comment summary.

    Args:
        response: a product detail page; ``response.meta['item']`` carries
            the item started in ``parse``.

    Yields:
        Request: the comment-summary request, handled by
        ``self.get_comment``, with a copy of the item in ``meta``.
    """
    item = response.meta['item']
    # Product link.
    item['good_url'] = response.url

    infor = dict()

    # Product introduction: each entry is a "key:value" string.
    parameter: list[str] = response.xpath('//div[@class="p-parameter"]/ul[2]/li/text()').getall()
    for text in parameter:
        # Split on the first colon only, accepting both the ASCII and the
        # full-width form, so values that contain a colon stay intact.
        cuts = [pos for pos in (text.find(":"), text.find("：")) if pos != -1]
        if not cuts:
            # No separator: not a key/value entry, nothing to record.
            continue
        cut = min(cuts)
        infor[text[:cut]] = text[cut + 1:]

    # Specification and packaging: keys and values are parallel lists.
    size_packing_key: list[str] = response.xpath('//div[@class="Ptable"]/div/dl/dl/dt/text()').getall()
    size_packing_value: list[str] = response.xpath('//div[@class="Ptable"]/div/dl/dl/dd/text()').getall()
    infor.update(zip(size_packing_key, size_packing_value))
    logger.info(msg=f"商品参数规格信息 {infor}")

    # Product information.
    item['good_infor'] = infor

    # Fetch the comment summary.
    url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={item["good_id"]}'
    logger.info(msg=f"正在爬取商品评论 {url}")
    yield Request(
        url=url,
        meta={'item': copy.deepcopy(item)},
        callback=self.get_comment,
        dont_filter=True,
    )
get_comment(self, response: Response)
# Read the comment summary and return the finished item.
def get_comment(self, response: Response):
    """Add the comment counters to the item and yield it.

    Args:
        response: the comment-summary response (expected to be JSON with a
            ``CommentsCount`` list); ``response.meta['item']`` carries the
            item built so far.

    Yields:
        The item with ``comment_number``, ``good_comment``,
        ``medium_comment`` and ``bad_comment`` set. When the response holds
        no usable summary the four fields are set to ``None`` and a warning
        is logged, so the already collected detail data is not lost.
    """
    item = response.meta['item']

    try:
        payload = json.loads(response.text)
    except ValueError:
        # Not JSON at all (for example an empty or HTML response).
        payload = None

    summaries = payload.get('CommentsCount') if isinstance(payload, dict) else None
    summary = summaries[0] if isinstance(summaries, list) and summaries else None
    if not isinstance(summary, dict):
        logger.warning(msg=f"未获取到评论信息: {response.url}")
        summary = {}

    # Item field -> key in the summary payload:
    # total comments, good, medium and bad ratings.
    field_map = (
        ('comment_number', 'CommentCountStr'),
        ('good_comment', 'GoodCountStr'),
        ('medium_comment', 'GeneralCountStr'),
        ('bad_comment', 'PoorCountStr'),
    )
    for field, key in field_map:
        item[field] = summary.get(key)

    yield item
爬取时,需要经常更换 IP,不然容易什么都获取不到;源码里的动态转发需要更改为自己的密钥和订单
注意每个 Request请求 meta 的传递方式
