python3 爬虫教程-百问十七

python3 爬虫教程

的有关信息介绍如下：

python3 爬虫教程

这里介绍如何使用python进行图片获取的爬虫开发过程

在python文件中定义一个获取html文件流的函数

def getHtml(url):

page = urllib.urlopen(url) #urllib.urlopen()方法用于打开一个URL地址

html = page.read() #read()方法用于读取URL上的数据

return html

在python中定义在文件流中查找图像对象的函数，需要使用到正则表达式进行图像文件的匹配。

def getImgList(html):

print html

reg = r'http://.+?\.jpg'

imgre = re.compile(reg)

imglist = re.findall(imgre,html)

print "imglist:%s" %imglist

return imglist

定义获取到的图像文件下载到本地硬盘的函数，使用随机数生成文件名避免文件名冲突。

#将文件列表写入磁盘

def writeImgList(imglist,start):

x = start

for imgurl in imglist:

print "begin write the image %s" %imgurl

fileName = 'e:\jiandan\%s.jpg' %random.randint(100000, 9999999)

if os.path.exists(fileName):

fileName = 'e:\jiandan\%s.jpg' %random.randint(100000, 9999999)

#urllib.urlretrieve(imgurl,fileName)

urllib.urlretrieve(imgurl,'%s' %fileName)

x+=1

输入一个网址进行图像文件的查找并下载到本地

url = "http://www.moko.cc/channels/post/23/1.html"

imgList = getImgList(getHtml(url))

print imgList

writeImgList(imgList,0)

完整pytyon的代码如下：

#coding=utf-8

#urllib模块提供了读取Web页面数据的接口

import urllib

#re模块主要包含了正则表达式

import re

import random

import os

#定义一个getHtml()函数

def getHtml(url):

page = urllib.urlopen(url) #urllib.urlopen()方法用于打开一个URL地址

html = page.read() #read()方法用于读取URL上的数据

return html

#搜索图片文件

def getImgList(html):

print html

#http://wx2.sinaimg.cn/mw600/006GlaT2ly1fdaip4dnmpj30hq0qo0vl.jpg

reg = r'http://.+?\.jpg'

imgre = re.compile(reg)

imglist = re.findall(imgre,html)

print "imglist:%s" %imglist

return imglist

#将文件列表写入磁盘

def writeImgList(imglist,start):

x = start

for imgurl in imglist:

print "begin write the image %s" %imgurl

fileName = 'e:\jiandan\%s.jpg' %random.randint(100000, 9999999)

if os.path.exists(fileName):

fileName = 'e:\jiandan\%s.jpg' %random.randint(100000, 9999999)

#urllib.urlretrieve(imgurl,fileName)

urllib.urlretrieve(imgurl,'%s' %fileName)

x+=1

#url为搜索图片文件的网页地址

url = "http://www.moko.cc/channels/post/23/1.html"

imgList = getImgList(getHtml(url))

print imgList

writeImgList(imgList,0)

运行tupian.py文件进行图片查找并下载