本文共 2534 字,大约阅读时间需要 8 分钟。
以前写过一些 Python,主要都是调用 Linux 命令、封装命令来做 watchdog 监控,好像还没用 Python 爬过网页,于是就写一个吧。不如就爬一下公司内网征婚帖里 mm 的照片,说干就干,代码如下:
# -*- coding: utf-8 -*-
"""Log in to an intranet forum and download the photos attached to matchmaking posts.

Flow: POST the login form with a cookie-aware opener, fetch each board list
page, collect the post links that follow the character "征" on the page, then
fetch every post and save the ``.jpg`` images referenced by ``lazyload``
attributes.

The script keeps the Python 2 module names it was written with and falls back
to the Python 3 homes of the same APIs, so it runs under either interpreter.
The coding line above must be on line 1 or 2 of the file for Python 2 to
accept the non-ASCII literals below.
"""
import os
import re
from contextlib import closing

try:  # Python 2 module names, as used by the original script
    import cookielib
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3 homes of the same APIs
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode

# Login form fields.
# NOTE(review): the credentials are committed in source (the password is a
# placeholder); prefer reading them from the environment.
# NOTE(review): "redirect" is already percent-encoded, so urlencode() will
# encode its '%' characters a second time -- confirm the server expects that.
data = {
    'actionFlag': "loginAuthenticate",
    "lang": "en",
    "loginMethod": "login",
    'loginPageType': 'mix',
    "redirect": "http%3A%2F%2Fxinsheng.huawei.com",
    'uid': 'coder_xia',
    'password': 'xxxxx',
}
postdata = urlencode(data)

# Browser-like request headers sent with the login request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.56 Safari/537.17'}

# Login endpoint.
url_login = "https://login.xx.com/login/login.do"

# Board list URL. The original comment labels it "Shenzhen", while the save
# directory below is named "hangzhou" -- NOTE(review): confirm which is meant.
url_xinsheng = "http://xinsheng.xx.com/cn/index.php?app=forum&mod=List&act=index&class=409&cate=44"

# Directory the pictures are written to (created on first use).
SAVE_DIR = "F:\\xinsheng\\hangzhou"

# Last list page to crawl; pages 2..LAST_PAGE are fetched after the first one.
LAST_PAGE = 30

# UTF-8 bytes of the text that the login response is searched for.
_WELCOME_MARKER = u"欢迎".encode("utf-8")

# Image URLs are taken from lazyload="...jpg" attributes.
_PICTURE_RE = re.compile(r'lazyload="(.+?\.jpg)"')

# A post link is the first quoted http:// URL containing "mod=Detail" that
# follows the character "征" on the same line.
# NOTE(review): the URL must end in a digit 1-9, so links ending in 0 are
# skipped -- confirm whether that is intended.
_LINK_RE = re.compile(u'征.*?"(http://.*?mod=Detail.*?[1-9])"')


def _as_text(html):
    """Return *html* as text, decoding raw bytes as UTF-8 (bad bytes replaced)."""
    if isinstance(html, bytes):
        return html.decode("utf-8", "replace")
    return html


def login(url_login):
    """POST the login form to *url_login* and install a cookie-aware opener.

    The opener is installed globally with ``urllib2.install_opener`` so that
    later ``urllib2.urlopen`` calls reuse the session cookies.

    Prints "login success" or "login fail" and returns the same result as a
    bool (the original returned nothing, so existing callers are unaffected).
    """
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    urllib2.install_opener(opener)
    # The request body must be bytes on Python 3; urlencode() output is ASCII.
    request = urllib2.Request(url_login, postdata.encode("ascii"), headers)
    with closing(opener.open(request)) as response:
        body = response.read()
    logged_in = _WELCOME_MARKER in body
    print("login success" if logged_in else "login fail")
    return logged_in


def getHtml(url):
    """Fetch *url* through the installed opener and return the raw body."""
    with closing(urllib2.urlopen(url)) as page:
        return page.read()


def getAllPictureLink(html):
    """Return the ``.jpg`` URLs found in ``lazyload="..."`` attributes of *html*.

    *html* may be raw bytes (decoded as UTF-8) or text.
    """
    return _PICTURE_RE.findall(_as_text(html))


def getAllLink(html):
    """Return the post-detail URLs that follow the character "征" in *html*.

    *html* may be raw bytes (decoded as UTF-8) or text.
    """
    return _LINK_RE.findall(_as_text(html))


def savePicture(pic_name, url, save_dir=SAVE_DIR):
    """Download *url* and store it in *save_dir*.

    The local file name is built from the third and fourth "-"-separated
    fields of *pic_name*, so *pic_name* must contain at least four fields
    (the caller checks this before calling).

    The image is downloaded before the file is opened, so a failed download
    no longer leaves an empty file behind, and both handles are closed even
    when an error occurs. The download goes through ``urllib2.urlopen`` so it
    shares the login cookies and raises on HTTP errors instead of saving the
    error page as an image.
    """
    fields = pic_name.split("-")
    target = os.path.join(save_dir, fields[2] + "-" + fields[3])
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    with closing(urllib2.urlopen(url)) as remote:
        payload = remote.read()
    with open(target, "wb") as out_file:
        out_file.write(payload)


def downloadPitureInURL(url):
    """Download every picture from every post linked on the list page *url*.

    Only image URLs that split into exactly six "/"-separated parts, whose
    last part has exactly four "-"-separated fields, are saved; the rest are
    skipped. A picture that cannot be fetched is reported and skipped so one
    broken link does not abort the crawl.
    """
    for post_url in getAllLink(getHtml(url)):
        print('url = ' + post_url)
        for img_url in getAllPictureLink(getHtml(post_url)):
            print(img_url)
            segments = img_url.split("/")
            if len(segments) != 6 or len(segments[5].split("-")) != 4:
                continue
            try:
                savePicture(segments[5], img_url)
            except urllib2.URLError as err:
                print("skip " + img_url + ": " + str(err))


def main():
    """Log in, then crawl the first list page followed by pages 2..LAST_PAGE."""
    login(url_login)
    downloadPitureInURL(url_xinsheng)
    for page_no in range(2, LAST_PAGE + 1):
        url_every_page = url_xinsheng + "&p=" + str(page_no)
        print("url_every_page = " + url_every_page)
        downloadPitureInURL(url_every_page)


if __name__ == "__main__":
    main()

# Reference: http://www.cnblogs.com/sysu-blackbear/p/3629770.html