页面元素检测脚本

我们经常遇到某个页面缺少一个 js 文件之类的情况,一般使用 Firefox 的 HttpFox 或者 Chrome 自带的开发者工具都可以查看哪些元素有问题。这里用 Python 写了一个脚本,分析页面里引用的元素,然后逐个发起请求,检查是否有元素不能成功获取。不过 urllib2 有些地方的异常处理做得还不完善,目前只是自己简单测试了一下,权当熟悉一下 Python 的用法。刚从 Perl 切换到 Python,还有很多地方不太习惯,尤其是列表和字符串的操作有些不一样。

  
#!/usr/bin/env python  
import urllib2  
import gzip  
import binascii  
import re,sys  
import string  
from StringIO import StringIO  
def gunziptxt(data):
    """Decompress a gzip-compressed byte string and return the payload.

    data: raw gzip bytes (e.g. an HTTP body served with
          Content-Encoding: gzip).
    Returns the decompressed content.
    """
    buf = StringIO(data)
    zf = gzip.GzipFile(fileobj=buf, mode="rb")
    try:
        return zf.read()
    finally:
        # The original never closed the GzipFile; close it explicitly
        # so the decompressor state is released even if read() raises.
        zf.close()
def http_code(url):  
request=urllib2.Request(url,headers={User-agent:"python urllib browser","Accept-Encoding":gzip})  
try:  
response=urllib2.urlopen(request,timeout=5)  
return response.getcode()  
except  urllib2.HTTPError,error:  
print "url:",error.reason  
return error.code  
except urllib2.URLError,error:  
print url,error.reason  
return 1000  
def http_client(url):  
request=urllib2.Request(url,headers={User-agent:"python urllib browser","Accept-Encoding":gzip})  
try:  
response=urllib2.urlopen(request,timeout=5)  
info=response.info()  
data=response.read()  
except urllib2.HTTPError,error:  
print "%s error:%s" %(url,error.reason)  
return None  
except urllib2.URLError,error:  
print error.reason  
return None

if info.get("content-encoding",None) == gzip:  
outdata=gunziptxt(data)  
else:  
outdata=data  
return outdata  
def get_src(page):
    """Extract all absolute src="..." URLs referenced by *page*.

    page: HTML text; may be None or empty, in which case an empty set
          is returned.
    Returns a set of unique http(s) URLs found in src attributes.
    """
    # The original pattern was destroyed by smart-quote substitution;
    # this is the reconstructed intent: match src = "URL" or src = 'URL'
    # where URL is an absolute http(s) address.
    src_re = re.compile(r'src\s*=\s*["\']\s*(https?://[^"\']+?)["\']')
    if page:
        return set(src_re.findall(page))
    else:
        return set()
if len(sys.argv)<2:  
print "usage:\n\t",sys.argv[0],"url"  
exit(1)  
if __name__ == "__main__":  
urls=sys.argv[1]  
pages=http_client(urls)  
if pages:  
links=get_src(pages)  
else:  
exit(1)  
for link in links:  
code=http_code(link)  
if code >399:  
print "%s \x1B[1;31m%d\x1B[m"%(link,code)  
else:  
print "%s \x1B[1;32m%d\x1B[m"%(link,code)  
else:  
print "pagecheck test"

用法比较简单:
python pagecheck.py 'http://www.baidu.com'