二分法查找指定时间的日志

二分法是一种很基础的查找方法。试想这样一个场景:应用的访问量非常大,单天的单个日志文件超过 100G,而我们需要准实时地统计出 TPM(每分钟的请求量)。如果没有 Storm 之类的高级工具,只能自己写脚本来统计,其实不太好办。这时可以试试每次用二分法找出上一分钟的日志在文件中的偏移量,然后从该偏移量开始顺序读入日志进行处理,这样就可以比较高效地跳过前面大量的日志(前提是日志按时间顺序写入)。下面用 Python 简单写了个脚本:

  
#!/usr/bin/env python  
import re  
import datetime  
import sys  
class logtools:
    """Count the lines logged during the previous minute of a large log file.

    Instead of reading the file from the top, the byte offset at which to
    start reading is found by bisecting on the time of day found in the log
    lines, and only the part of the file after that offset is scanned.

    Assumptions made by the implementation:
      * each log line carries an ``HH:MM:SS`` time preceded by whitespace;
      * lines are written in chronological order;
      * every line is treated as belonging to the current day (only the
        time of day is parsed).

    The file is opened in binary mode because the search seeks to arbitrary
    byte offsets, which text-mode file objects do not support on Python 3.
    """

    # Time of day inside a log line. A bytes pattern, because lines are
    # read from a binary-mode file.
    _TIME_RE = re.compile(br"\s+(?P<hour>\d+):(?P<min>\d+):(?P<sec>\d+)")

    def __init__(self, filename="/xx/acess.log"):
        """Open ``filename`` for reading; exit with status 1 if that fails."""
        self.logname = filename
        # Set before open() so __del__ can always inspect the attribute.
        self.fd = None
        try:
            self.fd = open(filename, "rb")
        except IOError:
            print("open log failed")
            sys.exit(1)

    def __del__(self):
        """Close the log file if it was opened."""
        fd = getattr(self, "fd", None)
        if fd is None:
            return
        try:
            fd.close()
        except (IOError, OSError):
            print("close fd failed")

    def _to_epoch(self, moment):
        """Return the naive local datetime ``moment`` as integer epoch seconds."""
        # Local import: the module header does not import ``time``.
        # mktime() replaces strftime("%s"), which is a platform extension.
        import time
        return int(time.mktime(moment.timetuple()))

    def get_last_min(self):
        """Describe the minute to count, based on a single reading of the clock.

        Returns a tuple ``(search_from, pattern, label)``:
          * ``search_from`` - epoch seconds of "two minutes ago", the time
            the offset search aims for;
          * ``pattern`` - regex source matching ``HH:MM:`` of "one minute
            ago", preceded by whitespace;
          * ``label`` - the same minute formatted as ``HH:MM``.
        """
        now = datetime.datetime.now()
        search_from = now - datetime.timedelta(minutes=2)
        counted = now - datetime.timedelta(minutes=1)
        pattern = r"\s+" + counted.strftime("%H:%M:")
        label = counted.strftime("%H:%M")
        return (self._to_epoch(search_from), pattern, label)

    def get_current_min(self):
        """Return the time of the next timestamped line as epoch seconds.

        Reads forward from the current file position until a line contains
        a valid ``HH:MM:SS`` time, combines it with today's date and returns
        it as integer epoch seconds. Returns ``None`` when the end of the
        file is reached first.
        """
        today = datetime.datetime.now()
        while True:
            line = self.fd.readline()
            if not line:
                return None
            match = self._TIME_RE.search(line)
            if not match:
                continue
            try:
                stamp = datetime.datetime(
                    year=today.year,
                    month=today.month,
                    day=today.day,
                    hour=int(match.group("hour")),
                    minute=int(match.group("min")),
                    second=int(match.group("sec")),
                )
            except ValueError:
                # Digits that look like a time but are out of range
                # (e.g. "99:99:99"): ignore this line and keep looking.
                continue
            return self._to_epoch(stamp)

    def get_last_seek(self, last_time):
        """Return a byte offset from which to start scanning for ``last_time``.

        ``last_time`` is in epoch seconds. The returned offset is the lower
        bound of the bisection, so it never lies past the first line whose
        time is ``>= last_time``; a few older lines may follow it.

        Special cases:
          * no timestamp in the file (including an empty file): the file
            size is returned, so nothing is scanned;
          * the first timestamp is already ``>= last_time``: offset 0;
          * ``last_time`` is later than the timestamp found near the end of
            the file: a message is printed and the offset of that tail
            probe is returned.
        """
        fd = self.fd
        fd.seek(0, 0)
        start_seek = fd.tell()
        start_time = self.get_current_min()
        # Granularity of the search: twice the length of the line after the
        # first timestamped one, and at least 1 so the loop always advances.
        pos_off = max(len(fd.readline()) * 2, 1)
        fd.seek(0, 2)
        file_size = fd.tell()

        if start_time is None:
            return file_size
        if last_time <= start_time:
            # Everything in the file is at or after the target time.
            return start_seek

        # Probe the tail for the upper bound. Widen the probe while it holds
        # no timestamp (for instance when it starts inside a long last line).
        window = pos_off
        while True:
            end_seek = max(file_size - window, 0)
            fd.seek(end_seek, 0)
            end_time = self.get_current_min()
            # At offset 0 a timestamp is certain, since start_time was found.
            if end_time is not None or end_seek == 0:
                break
            window *= 2

        if last_time > end_time:
            print("error %d > %d" % (last_time, end_time))
            return end_seek

        # Invariant: start_time < last_time <= end_time.
        while end_seek - start_seek > 2 * pos_off and end_time - start_time > 3:
            half_seek = (end_seek + start_seek) // 2
            fd.seek(half_seek, 0)
            half_time = self.get_current_min()
            if half_time is None:
                # No timestamp after the midpoint: the target lies before it.
                end_seek = half_seek
            elif last_time <= half_time:
                end_seek = half_seek
                end_time = half_time
            else:
                start_seek = half_seek
                start_time = half_time
        return start_seek

    def get_tpm(self):
        """Count, print and return the number of lines logged last minute.

        Prints ``"<HH:MM> qps <count>"`` and returns ``count``.
        """
        # One call only, so the three values describe the same instant.
        time_pre, pattern, reg_time = self.get_last_min()
        regex = re.compile((pattern + r"\d{2}").encode("ascii"))
        pos = self.get_last_seek(time_pre)
        self.fd.seek(pos, 0)
        query = 0
        while True:
            line = self.fd.readline()
            if not line:
                break
            if regex.search(line):
                query += 1
        print("%s qps %d" % (str(reg_time), query))
        return query
# Script entry: the log file path is taken from the first command-line argument.
if len(sys.argv) < 2:
    # A missing argument used to surface as a bare IndexError.
    sys.stderr.write("usage: %s <logfile>\n" % sys.argv[0])
    sys.exit(2)
a = logtools(filename=sys.argv[1])
a.get_tpm()