【原创】python爬取win4000实例_python爬虫_学习笔记

【原创】python爬取win4000实例

2019-03-18 14:26 0次 1 加入收藏

摘要： 今天使用python写了一个爬虫，用来爬取win4000.com，python版本是2.7，字也懒得打那么多，直接贴代码了

今天使用python写了一个爬虫，用来爬取win4000.com，python版本是2.7，字也懒得打那么多，直接贴代码了：

# -*- coding:utf-8 -*-
import logging
from bs4 import BeautifulSoup
import os
import MySQLdb
import re
import sys
import traceback
import requests
import lxml

# Python 2 only: make UTF-8 the default codec for implicit str/unicode
# conversions. The functions below rely on this when they concatenate byte
# string literals with the unicode text returned by BeautifulSoup.
reload(sys)
sys.setdefaultencoding('utf-8')

# Module-wide MySQL connection and cursor shared by every insert helper.
# NOTE(review): credentials are hard-coded and the connection is opened at
# import time -- consider moving both into configuration / the main guard.
conn = MySQLdb.connect(user="root",passwd="root",host="localhost",charset="utf8",db="pic")
cur = conn.cursor()

# Uncomment to turn on debug-level logging:
# logging.basicConfig(level=logging.DEBUG)

# Browser-like request headers sent with every HTTP request made by request().
headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
}

def request(url):
    """Fetch *url* with the module-level headers and return the raw body.

    Args:
        url: Absolute URL to GET.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.RequestException: on a connection failure, when the
            5 second timeout expires, or when the server answers with an
            HTTP 4xx/5xx status.
    """
    s = requests.get(url,headers=headers,timeout=5)
    # Fail here on an error status instead of handing an error page to the
    # HTML parser, where it would only surface later as a confusing
    # AttributeError on a missing element.
    s.raise_for_status()
    return s.content

# Write one row to the database
def inputSql(sql,data):
    """Execute a parameterized statement on the shared cursor and commit it.

    This is a best-effort write: a failure is reported and rolled back but
    never propagated, so the crawl continues with the next row.

    Args:
        sql: SQL text with ``%s`` placeholders.
        data: Sequence of values bound to the placeholders.
    """
    try:
        # Run the statement with bound parameters
        cur.execute(sql,data)
        # Make the change permanent
        conn.commit()
    except Exception:
        # Report the failure instead of discarding it silently, then undo the
        # partial work. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit pass through.
        traceback.print_exc()
        conn.rollback()


def inputThumb(title,source):
    """Insert one thumbnail row into thumb_list with status 0.

    Args:
        title: Title stored in the ``title`` column.
        source: Thumbnail URL stored in the ``source`` column.
    """
    row = [title, 0, source]
    inputSql("INSERT INTO thumb_list(title,status, source) VALUES (%s, %s, %s)", row)

# 获取分页详细内容
def getPageList(url,page,title,cid):
    pre_url = str(url).replace(".html","")
    for p in xrange(1,int(page)+1):
        try:
            urls = pre_url + "_" + str(p) + ".html"
            # 获取大图标题
            title_con = title + "-第" + str(p) + "张"
            # 获取分类名称
            soup = BeautifulSoup(request(urls),'lxml')
            tname = soup.find("div",class_="breadcrumbs").find_all("a")
            for t in tname:
                tname = t.text
            # 获取描述
            description = str(soup).split("\" name=\"description\"/>")[0].split("keywords\"/>")[-1].replace("<meta content=\"","").strip().split("。")[0]
            # 获取标签
            tags = soup.find('div',class_="label").find_all("a")
            tags_text = ""
            for tg in tags:
                tags_text += tg.text + ","
            tags = str(tags_text)[:-1]
            # 大图
            img = soup.find("img",class_="pic-large")
            img = img.get("src")
            # 后缀
            ext = str(img).split('.')[-1]
            # 图片尺寸
            imgSize = soup.find("span",class_="size").find("em")
            imgSize = imgSize.text

            # SQL 插入语句
            sql = "INSERT INTO img_list(title, cid, tags, source, ext, tname, imgSize, description,status) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            data = [title_con, cid, tags, img, ext, tname , str(imgSize), description, 0]
            inputSql(sql,data)
            print "正在入库：","大图ID： ", int(cur.lastrowid) ,"  缩略图：" , cid ,"  标题：" , title, " 大图链接：" ,img

        except:
            traceback.print_exc()
            sys.exit(0)

# Store a gallery's thumbnail, then crawl all of its pages
def getListContent(url,thumb):
    """Record one gallery in thumb_list and hand its pages to getPageList().

    Args:
        url: URL of the gallery's first page.
        thumb: Thumbnail URL stored alongside the gallery title.
    """
    header = BeautifulSoup(request(url),'lxml').find("div",class_="ptitle")
    # The <em> inside the title block holds the page count, the <h1> the title
    page = header.find("em").text
    title = header.find("h1").text
    inputThumb(title, thumb)
    # The id of the row just inserted links every image back to its thumbnail
    getPageList(url,page,title,int(cur.lastrowid))


# Collect gallery links and thumbnails from a listing page
def getUrl(url):
    """Crawl every gallery linked from the listing page at *url*.

    Each ``<a>`` inside the ``tab_box`` container supplies a gallery URL
    (``href``) and a thumbnail URL (the ``data-original`` attribute of its
    ``<img>``); both are passed to getListContent().

    Args:
        url: URL of the listing page.

    The first gallery that fails prints a message plus the traceback and
    terminates the process with exit status 1.
    """
    soup = BeautifulSoup(request(url), 'lxml')
    anchors = soup.find("div", class_="tab_box").find_all("a")
    for lk in anchors:
        try:
            # Use a separate name so the ``url`` argument is not overwritten
            detail_url = lk.get("href")
            thumb = lk.find("img").get("data-original")
            getListContent(detail_url, thumb)
        except Exception:
            # ``except Exception`` lets SystemExit (raised deeper in the
            # crawl) and KeyboardInterrupt propagate untouched; real errors
            # are reported with their cause and a failing exit status.
            print("获取详情链接和缩略图失败")
            traceback.print_exc()
            sys.exit(1)

if __name__ == '__main__':
    # Entry point: crawl the first listing page of the mobile wallpaper section
    url = "http://www.win4000.com/mobile_0_0_0_1.html"
    try:
        getUrl(url)
    finally:
        # Close the database connection even when the crawl aborts: getUrl
        # ends the process with sys.exit on failure, which would otherwise
        # skip this call.
        conn.close()

微信图片_20190318141757.png

代码运行后会直接把图片链接存入数据库，如需下载代码请联系17762131；创建数据表的 SQL 语句如下：

-- img_list: one row per large image inserted by the crawler's getPageList().
-- NOTE(review): AUTO_INCREMENT=249 looks like a leftover from dumping a
-- populated table; a fresh install will start numbering at 249 -- confirm.
-- NOTE(review): MyISAM does not support transactions, so the crawler's
-- rollback cannot undo a failed insert -- confirm this is acceptable.
CREATE TABLE `img_list` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL COMMENT '标题',
  -- The crawler stores the id of the parent thumb_list row here.
  `cid` int(11) DEFAULT NULL,
  `tags` varchar(255) DEFAULT NULL COMMENT '标签',
  `source` varchar(255) DEFAULT NULL COMMENT '图片源地址',
  `ext` varchar(32) DEFAULT NULL COMMENT '资源后缀',
  -- Not written by the crawler's INSERT; presumably filled in by a later download step.
  `file` varchar(55) DEFAULT NULL COMMENT '本地图片地址',
  -- NOTE(review): only 5 characters; longer category names will not fit -- confirm.
  `tname` varchar(5) DEFAULT NULL COMMENT '分类',
  `imgSize` varchar(255) DEFAULT NULL COMMENT '大图尺寸',
  `description` varchar(255) DEFAULT NULL COMMENT '描述',
  -- The crawler inserts the integer 0 into this varchar column.
  `status` varchar(32) DEFAULT NULL COMMENT '状态',
  PRIMARY KEY (`id`),
  -- Each image URL may be stored only once; re-inserting the same URL fails.
  UNIQUE KEY `source` (`source`),
  KEY `cid` (`cid`)
) ENGINE=MyISAM AUTO_INCREMENT=249 DEFAULT CHARSET=utf8;

-- thumb_list: one row per gallery (title + thumbnail URL) inserted by the
-- crawler's inputThumb(); its id is stored in img_list.cid.
-- NOTE(review): AUTO_INCREMENT=25 looks like a leftover from dumping a
-- populated table -- confirm.
CREATE TABLE `thumb_list` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL COMMENT '标题',
  -- The crawler inserts the integer 0 into this varchar column.
  `status` varchar(32) DEFAULT NULL COMMENT '状态',
  -- NOTE(review): 100 characters here versus 255 for img_list.source -- confirm
  -- that thumbnail URLs always fit.
  `source` varchar(100) DEFAULT NULL COMMENT '缩略图源地址',
  -- Not written by the crawler's INSERT; presumably filled in by a later download step.
  `file` varchar(255) DEFAULT NULL COMMENT '本地地址',
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=25 DEFAULT CHARSET=utf8;

这篇采集代码比较简单。当然，如果需要讨论更多关于爬虫的知识，请添加底部的站长交流群。未经本人同意，禁止转载。

本文由帝一博客原创发布。用户在本站发布的原创内容（包括但不仅限于回答、文章和评论），著作权均归用户本人所有。独家文章转载，请联系邮箱：17762131@qq.com。获得授权后，须注明本文地址： https://bubukou.com/pythonpachong/342.html