使用了多线程但没有处理同步。
正则表达式是用 Kodos 调试出来的,这个工具使用起来比较顺手。
- #!/usr/bin/env python
- #coding=cp936
- import re
- import urllib
- import thread
- import time
- #url_part1 = '''http://images.google.cn/images?q='''
- #url_part2 = '''&svnum=10&hl=zh-CN&lr=&newwindow=1&start='''
- #url_part3 = '''&sa=N'''
- url_part1 = '''http://image.baidu.com/i?z=0&cl=2&ct=201326592&sn=&lm=-1&cm=1&sc=0&bu=&rn=16&tn=baiduimage&word='''
- url_part2 = '''&pn='''
- def nextpage(keywords,start):
- return url_part1 + urllib.quote(keywords) + url_part2 + str(start)
- #compile_obj = re.compile(r'dyn.Img\("http://(.)+","","","')
- compile_obj = re.compile(r'<a href="http://image.baidu.com/ir\?t=1&u=http://(.)+&f=http://')
- def parseurl(content):
- ret = []
- match_objs = compile_obj.finditer(content)
- for match in match_objs:
- url = match.group().split('&')[1][2:]
- ret.append(url)
- return ret
- #exitdic = {}
- def downimg(url,exitdic):
- filename = url.split('/')[-1]
- #print url,'---->',filename
- urllib.urlretrieve(url,filename)
- exitdic[url].acquire()
- def download(url):
- exitdic = {}
- print '开始下载页面: ' + url
- conn = urllib.urlopen(url)
- content = conn.read()
- conn.close()
- urls = parseurl(content)
- for url in urls:
- print url
- exitdic[url] = thread.allocate_lock()
- thread.start_new(downimg,(url,exitdic))
- for key in exitdic.keys():
- while not exitdic[key].locked():
- #time.sleep(200)
- pass
- print '完成一页'
- #抓取关键字为vista的图片,抓取范围为查询结果的第4到8页
- if __name__ == '__main__':
- for i in range(4,8):
- url = nextpage('vista',i*16)
- download(url)
- print 'OK'
#!/usr/bin/env python
#coding=cp936
import re
import urllib
import thread
import time
# Earlier Google Images endpoint, kept for reference:
#url_part1 = '''http://images.google.cn/images?q='''
#url_part2 = '''&svnum=10&hl=zh-CN&lr=&newwindow=1&start='''
#url_part3 = '''&sa=N'''

# Baidu image-search URL pieces: the search keyword is appended after
# url_part1 and the result offset after url_part2.
url_part1 = (
    'http://image.baidu.com/i'
    '?z=0&cl=2&ct=201326592&sn=&lm=-1&cm=1&sc=0&bu=&rn=16&tn=baiduimage'
    '&word='
)
url_part2 = '&pn='
def nextpage(keywords,start):
    """Build the search-result URL for ``keywords`` at result offset ``start``.

    ``keywords`` is percent-encoded with ``urllib.quote`` and ``start`` is
    converted with ``str`` before both are spliced between the module-level
    ``url_part1`` and ``url_part2`` fragments.
    """
    encoded_keywords = urllib.quote(keywords)
    offset_text = str(start)
    return url_part1 + encoded_keywords + url_part2 + offset_text
# Earlier pattern for Google Images result pages, kept for reference:
#compile_obj = re.compile(r'dyn.Img\("http://(.)+","","","')

# Matches one Baidu image-result link and captures the image URL carried in
# its ``u`` query parameter (group 1). The capture is non-greedy: a greedy
# ``.+`` would swallow everything from the first link on a line up to the
# last ``&f=http://`` on that line, so only one URL per line would be found.
compile_obj = re.compile(
    r'<a href="http://image.baidu.com/ir\?t=1&u=(http://.+?)&f=http://')


def parseurl(content):
    """Return the image URLs referenced by a result page, in document order.

    Args:
        content: text of the result page.

    Returns:
        A list with one entry per link matched by ``compile_obj``; each entry
        is the text between ``u=`` and the following ``&f=http://``. The list
        is empty when nothing matches.
    """
    return [match.group(1) for match in compile_obj.finditer(content)]
#exitdic = {}
def downimg(url,exitdic):
    """Download one image and signal completion through its lock.

    The image is saved in the current working directory under the last
    ``/``-separated segment of ``url``. Completion is signalled by acquiring
    ``exitdic[url]``; the caller detects it by polling ``locked()``.

    Args:
        url: address of the image to fetch.
        exitdic: dict mapping each URL to the lock used as its "done" flag.

    Raises:
        Whatever ``urllib.urlretrieve`` raises; the lock is acquired first.
    """
    filename = url.split('/')[-1]
    try:
        urllib.urlretrieve(url,filename)
    finally:
        # Mark this URL as finished even when the transfer fails; otherwise
        # the caller polling the lock would wait forever.
        exitdic[url].acquire()
def download(url):
    """Fetch one result page and download every image it links to.

    The page is read with ``urllib.urlopen``, image URLs are extracted with
    ``parseurl``, and one thread running ``downimg`` is started per distinct
    URL. The function returns once every per-URL lock reports ``locked()``,
    which is how ``downimg`` signals that it is done.

    Args:
        url: address of the search-result page.
    """
    print('开始下载页面: ' + url)
    conn = urllib.urlopen(url)
    try:
        content = conn.read()
    finally:
        # Release the connection even if reading the page fails.
        conn.close()

    exitdic = {}
    for img_url in parseurl(content):
        print(img_url)
        if img_url in exitdic:
            # A repeated URL would replace the lock of the worker already
            # running for it and leave the second worker blocked in
            # acquire(); one download per distinct URL is enough.
            continue
        # The lock must be registered before the worker starts, because the
        # worker looks it up in exitdic when it finishes.
        exitdic[img_url] = thread.allocate_lock()
        thread.start_new_thread(downimg,(img_url,exitdic))

    for done_flag in exitdic.values():
        while not done_flag.locked():
            # Sleep between polls instead of spinning at full CPU.
            time.sleep(0.05)
    print('完成一页')
# Fetch images for the keyword 'vista' (original note: result pages 4 to 8).
# The loop runs i = 4..7 and requests the result offset i*16 each time.
if __name__ == '__main__':
    keyword = 'vista'
    for page_index in range(4,8):
        page_url = nextpage(keyword,page_index*16)
        download(page_url)
    print('OK')
No comments:
Post a Comment