【原创】python 爬取wowo扣扣爬虫代码_python爬虫_学习笔记

【原创】python 爬取wowo扣扣爬虫代码

2019-09-05 14:46 0次 0 加入收藏

摘要： python 爬取wowo扣扣爬虫代码，直接上代码：

Python 爬取 wowo扣扣 图片的爬虫代码（基于 Python 2，依赖 requests、BeautifulSoup 和 lxml），直接上代码：

# -*- coding:utf-8 -*-
import logging
from bs4 import BeautifulSoup
import os
import sys
import traceback
import requests
import lxml
import time

# Python 2 idiom: switch the process-wide default string encoding to UTF-8.
# NOTE(review): the reload builtin and sys.setdefaultencoding are not available
# on Python 3, so this script only runs under Python 2 as written.
reload(sys)
sys.setdefaultencoding('utf-8')

# Enable logging (uncomment the next line to turn on debug output)
# logging.basicConfig(level=logging.DEBUG)
# Single requests session shared by every HTTP call below.
req = requests.session()
# Default browser-like headers. The request functions in this script build
# their own header dicts (headers1 / headers2) and do not reference this one.
headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
}
# Current local date as YYYY-MM-DD, computed once at import time; used as a
# path component of the download directory.
today = time.strftime("%Y-%m-%d", time.localtime())

def reqUrl(url):
    """Fetch ``url`` with the shared session and return the raw response body.

    The request carries browser-like headers: a fixed user agent, the
    requested address itself as the referer, and a fixed cookie string.
    The request times out after 5 seconds.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body exactly as received (``response.content``).
    """
    page_headers = {}
    page_headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    # The page is sent its own address as the referer.
    page_headers['Referer'] = str(url)
    page_headers['Cookie'] = 'Hm_lvt_492109f03bd65de28452325006c4a53c=1565577251,1567156182; security_session_verify=508e0aee5bd8c09525ec38ae8971ce7b; Hm_lpvt_492109f03bd65de28452325006c4a53c=1567157669'

    response = req.get(url, timeout=5, headers=page_headers)
    return response.content



# Download an image
def downloadsImg(filename,img,dir):
    """Download the image at URL ``img`` and save it as ``filename`` inside ``dir``.

    Args:
        filename: Name of the file to create inside ``dir``.
        img: URL of the image to fetch.
        dir: Target directory; created (including parents) when it does not
            exist yet. A trailing path separator is optional.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status; nothing is written to disk in that case.
        Exceptions from the session request, directory creation or file
        writing propagate unchanged to the caller.
    """
    if not os.path.exists(dir):
        os.makedirs(dir)
    print("===" * 11 + "开始下载 --%0--")
    headers1 = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
        'Cookie':'Hm_lvt_492109f03bd65de28452325006c4a53c=1565577251,1567156182; Hm_lpvt_492109f03bd65de28452325006c4a53c=1567160487'
    }

    response = req.get(img,timeout=20,headers=headers1)
    # Fail before touching the disk so an HTTP error page is never saved
    # under an image file name.
    response.raise_for_status()

    # os.path.join inserts a separator only when ``dir`` does not already end
    # with one, so both 'some/dir' and 'some/dir/' produce a valid path.
    target = os.path.join(dir, filename)
    # 'wb': the response body is the raw binary image data.
    with open(target, 'wb') as f:
        f.write(response.content)
    # Report completion only after the file has been written and closed.
    print(target)
    print("===" * 11 + "已完成 --100%-- ")


# Fetch a detail page and download every image in its article body
def getListContent(url,thumb):
    """Download all images found in the article body of the detail page ``url``.

    The page title (``h2.artTit``) names the download folder below
    ``e:/usr/wowoqq/<today>/``; every ``<img>`` inside ``div.artCon`` is
    passed to ``downloadsImg``. A failure on one image is reported with a
    traceback and does not stop the remaining downloads.

    Args:
        url: Address of the detail page.
        thumb: Thumbnail URL supplied by the caller. It is not used here and
            is kept so existing calls keep working.
    """
    soup = BeautifulSoup(reqUrl(url),'lxml')
    content = soup.find("div",class_="artCon")
    heading = soup.find("h2",class_="artTit")
    if content is None or heading is None:
        # The page does not have the expected layout (for example an error
        # page was served), so there is nothing to download from it.
        print("页面缺少标题或正文，已跳过: " + str(url))
        return

    # Surrounding whitespace from the markup must not end up in a folder name.
    title = heading.text.strip()
    print(title)
    # The title becomes a directory on a drive-letter (Windows) path: replace
    # the characters that are not allowed in Windows file names. '/' and '\\'
    # would additionally create unintended nested directories.
    folder = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in title)
    target_dir = 'e:/usr/wowoqq/' + str(today) + '/' + folder + '/'

    for ig in content.find_all("img"):
        img = ig.get("src")
        if not img:
            # An <img> without a src attribute has nothing to download.
            continue
        try:
            print(img)
            # The last URL path segment is used as the local file name.
            filename = str(img).split("/")[-1]
            downloadsImg(filename, img, target_dir)
        except Exception:
            # Report and move on to the next image. KeyboardInterrupt and
            # SystemExit are not caught, so the run can still be stopped.
            traceback.print_exc()


# Collect detail-page links and thumbnails from a listing page
def getUrl(url):
    """Process every detail page linked from the listing page ``url``.

    Each ``<a class="img">`` element contributes a detail link (its ``href``
    appended to the site root) and a thumbnail (the ``src`` of its nested
    ``<img>``); both are handed to ``getListContent``.

    Args:
        url: Address of the listing page.
    """
    soup = BeautifulSoup(reqUrl(url), 'lxml')
    for lk in soup.find_all("a",class_="img"):
        try:
            # A separate name keeps the ``url`` argument intact.
            detail_url = "https://www.wowoqq.com" + lk.get("href")
            thumb = lk.find("img").get("src")
            print(detail_url)
            getListContent(detail_url, thumb)
        except Exception:
            # One malformed entry should not end the crawl, and exiting with
            # status 0 would report success. Show the real cause and continue
            # with the next link.
            print("获取详情链接和缩略图失败")
            traceback.print_exc()

if __name__ == '__main__':
    # Crawl the listing pages index_1.html through index_3.html.
    for i in range(1,4):
        url = "https://www.wowoqq.com/tupian/beiying/index_"+str(i)+".html"
        try:
            getUrl(url)
        except Exception:
            # A failing page is reported and the next one is tried.
            # KeyboardInterrupt and SystemExit are deliberately not caught,
            # so Ctrl-C or sys.exit() really stops the run.
            traceback.print_exc()

结果如下图所示：