[Crawler Project] JD.com Mall

Overview

Language & framework: Python & Scrapy

Source code: https://pan.bigdataboy.cn/#/s/6euy


Crawling Approach

First search: get the total number of result pages

url (simplified): 'https://search.jd.com/Search?keyword={}'

method GET

headers {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}


Request the goods API directly to get the product id & price. Each visible result page is served as two API requests of 30 items each, so the API's page parameter counts half-pages and s is the starting item offset (see the sketch below).

url (simplified): 'https://search.jd.com/s_new.php?keyword={keyword}&page={page}&s={offset}'

method GET

headers {'referer': 'https://search.jd.com/Search'}
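The page and s parameters are easy to get wrong, so here is a minimal sketch of how the half-page request URLs can be derived. It assumes each result page is fetched as two API requests of 30 items, and it reuses the s offset formula from start_requests later in this post; api_urls is a hypothetical helper, not part of the source code.

def api_urls(keyword, total_pages):
    base = 'https://search.jd.com/s_new.php?keyword={}&page={}&s={}'
    # The API's page parameter counts half-pages, so it runs to twice
    # the number of visible result pages; s is the starting item offset
    for half_page in range(1, total_pages * 2 + 1):
        yield base.format(keyword, half_page, 1 + 25 * (half_page - 1))

for url in api_urls('手机', 2):
    print(url)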


Use the product id to request the product page and extract its details

url: https://item.jd.com/{product_id}.html

method GET


Request the comment API to get the comment counts

url: 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={product_id}'

method GET

headers {'referer': 'https://search.jd.com/Search'}
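A quick way to probe this endpoint outside Scrapy and inspect the JSON shape. The field names match the ones read in get_comment later in this post; the product id here is only a placeholder.

import requests

product_id = '100012043978'  # placeholder id, substitute a real one
url = ('https://club.jd.com/comment/productCommentSummaries.action'
       f'?referenceIds={product_id}')
resp = requests.get(url, headers={'referer': 'https://search.jd.com/Search'})
summary = resp.json()['CommentsCount'][0]
print(summary['CommentCountStr'], summary['GoodCountStr'])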


Writing the Spider

Basic setup

Create the project & spider file

scrapy startproject jdApp
cd jdApp
scrapy genspider jdspider jd.com
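
These commands generate the standard Scrapy scaffolding, roughly:

jdApp/
├── scrapy.cfg
└── jdApp/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── jdspider.py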

Edit the settings.py file

# Disguise the client
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Download delay
DOWNLOAD_DELAY = 1

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'referer': 'https://search.jd.com/Search', # JD checks this header
}

Overall Structure

import copy
import json
import logging
import re

import requests
import scrapy
from scrapy import Request, Selector
from scrapy.http import Response

from jdApp.items import JdappItem

logger = logging.getLogger(__name__)


class JdspiderSpider(scrapy.Spider):
    name = 'jdspider'
    allowed_domains = ['jd.com']

    # Search keyword for the goods to crawl
    good_name = ""

    # First request: get the total number of result pages
    search_url = 'https://search.jd.com/Search?keyword={}'  # first search endpoint
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

    # Goods API: each request returns 30 items
    goods_url_api = 'https://search.jd.com/s_new.php?keyword={}&page={}&s={}'  # paginated search API

    # Get the page count, then request every half-page
    def start_requests(self):
        pass

    # Extract each product id and request the detail page
    def parse(self, response: Response):
        pass

    # Parse the product details, then request the comment info
    def get_infor(self, response: Response):
        pass

    # Parse the comment info and yield the item
    def get_comment(self, response: Response):
        pass

if __name__ == '__main__':
    # Launch the spider
    from scrapy import cmdline
    cmdline.execute("scrapy runspider jdspider.py".split())
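
The JdappItem imported above is defined in items.py. A sketch of its fields, inferred from how the spider populates them later in this post (the source archive may define it slightly differently):

import scrapy

class JdappItem(scrapy.Item):
    good_id = scrapy.Field()         # product id (data-sku)
    good_price = scrapy.Field()      # price
    good_url = scrapy.Field()        # product page URL
    good_infor = scrapy.Field()      # dict of parameters / specs
    comment_number = scrapy.Field()  # total number of comments
    good_comment = scrapy.Field()    # positive comments
    medium_comment = scrapy.Field()  # neutral comments
    bad_comment = scrapy.Field()     # negative comments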

start_requests(self)

# Get the page count, then request every half-page
def start_requests(self):
    self.good_name = input("Enter the search keyword: ")
    r = requests.get(url=self.search_url.format(self.good_name), headers=self.headers)
    # Extract the total page count from the pager ("1 / <total>")
    all_pages = re.search(r'<b>1</b><em>/</em><i>(\d*?)</i>', r.text).group(1)
    logger.info(msg=f"total pages: {all_pages}")

    # Each page is loaded in two halves of 30 goods, so the API page
    # counter runs to twice the number of visible pages
    for _ in range(int(all_pages) * 2):
        # Build the URL for each half-page; s is the item offset
        goods_url_api = self.goods_url_api.format(self.good_name, _ + 1, 1 + 25 * _)
        logger.info(msg=f"requesting: {goods_url_api}")
        # Request the goods on this half-page
        yield Request(
            url=goods_url_api,
            callback=self.parse
        )

parse(self, response: Response)

# Extract each product id and request the detail page
def parse(self, response: Response):
    item = JdappItem()
    goods_list: list[Selector] = response.xpath("//div[@id='J_goodsList']/ul/li")
    for _ in goods_list:
        # Product price (the leading dot makes the XPath relative to the
        # current <li>; without it every item would get the first price)
        item['good_price'] = _.xpath('.//div[@class="p-price"]/strong/i/text()').get()
        # Product id
        item['good_id'] = _.xpath("@data-sku").get()
        url = f'https://item.jd.com/{item["good_id"]}.html'
        logger.info(msg=f"crawling product detail: {url}")

        # deepcopy the item so concurrent callbacks don't share one object
        yield Request(
            url=url,
            meta={'item': copy.deepcopy(item), 'url': url},
            callback=self.get_infor,
        )

get_infor(self, response: Response)

# Parse the product details, then request the comment info
def get_infor(self, response: Response):
    item = response.meta['item']
    # Product URL
    item['good_url'] = response.url

    infor = dict()
    # "Product introduction" parameter list
    parameter: list[str] = response.xpath('//div[@class="p-parameter"]/ul[2]/li/text()').getall()
    for _ in parameter:
        # Split the "name:value" pair at the first colon only, in case
        # the value itself contains one
        i = _.split(":", 1)
        infor[i[0]] = i[1]
    # "Specs & packaging" table
    size_packing_key: list[str] = response.xpath('//div[@class="Ptable"]/div/dl/dl/dt/text()').getall()
    size_packing_value: list[str] = response.xpath('//div[@class="Ptable"]/div/dl/dl/dd/text()').getall()
    for _ in zip(size_packing_key, size_packing_value):
        infor[_[0]] = _[1]

    logger.info(msg=f"product parameters and specs: {infor}")
    # Product info
    item['good_infor'] = infor

    # Request the comment counts
    url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={item["good_id"]}'
    logger.info(msg=f"crawling product comments: {url}")
    yield Request(
        url=url,
        meta={'item': copy.deepcopy(item)},
        callback=self.get_comment,
        dont_filter=True
    )

get_comment(self, response: Response)

# Parse the comment info and yield the item
def get_comment(self, response: Response):
    item = response.meta['item']
    comment = json.loads(response.text)
    summary = comment.get('CommentsCount')[0]
    # Total number of comments
    item['comment_number'] = summary.get('CommentCountStr')
    # Positive comments
    item['good_comment'] = summary.get('GoodCountStr')
    # Neutral comments
    item['medium_comment'] = summary.get('GeneralCountStr')
    # Negative comments
    item['bad_comment'] = summary.get('PoorCountStr')
    yield item
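
To persist the yielded items, one simple option is a JSON-lines pipeline. A minimal sketch, assuming the class lives in pipelines.py and is registered in ITEM_PIPELINES; the class name and output path are my own choices, not from the source:

import json

class JdAppPipeline:
    def open_spider(self, spider):
        self.file = open('goods.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line, keeping non-ASCII text readable
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()

# settings.py
# ITEM_PIPELINES = {'jdApp.pipelines.JdAppPipeline': 300}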

Tips

When crawling the product detail pages you need to rotate IPs frequently, otherwise you will quickly start getting nothing back. The dynamic proxy forwarding in the source code must be replaced with your own key and order number; a generic middleware sketch follows below.

Pay attention to how meta is passed on each Request: the item is deep-copied so concurrent callbacks do not overwrite each other's data.
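
The source code's dynamic forwarding is vendor-specific, but the general shape of an IP-rotating downloader middleware is just setting request.meta['proxy']. A sketch with placeholder credentials; replace the endpoint, key, and order number with your own, and enable it in DOWNLOADER_MIDDLEWARES:

class ProxyMiddleware:
    # Placeholder forwarding endpoint; substitute your vendor's address
    # and your own key / order credentials
    PROXY = 'http://your-key:your-order@forward.example.com:8000'

    def process_request(self, request, spider):
        request.meta['proxy'] = self.PROXY

# settings.py
# DOWNLOADER_MIDDLEWARES = {'jdApp.middlewares.ProxyMiddleware': 543}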
