2015-04-16 79 views
1

在 Python 中,如何按 timedelta 邻域来缩减(归并)一个 datetime 列表?——按 timedelta 缩减日期时间列表

如果我有

dates = [ 
     dt.datetime(1970, 1, 1, 0, 2), 
     dt.datetime(1970, 1, 1, 0, 3), 
     dt.datetime(1970, 1, 1, 0, 7), 
     dt.datetime(1970, 1, 1, 0, 8) 
    ] 

和timedelta

delta = dt.timedelta(minutes=2) 

我怎样才能得到呢?

expected = [ 
     dt.datetime(1970, 1, 1, 0, 2, 30), 
     dt.datetime(1970, 1, 1, 0, 7, 30) 
    ] 

编辑

以数字为例,如果我有这个号码列表

numbers = [1,2,6,7] 
delta = 1 

我想把相近的值分到同一组,并得到每组的特征值(中间值)。delta 是值之间的最大距离。

为数字,特征值是

[1.5, 6.5] 

因为这些值被分成 [1,2] 和 [6,7] 两组,然后分别计算出平均值。

+0

确认一下:你的目标是遍历初始列表,并剔除与当前值相差在 timedelta 以内的所有条目吗? – wnnmaw

+0

你说的 timedelta 邻域(neighborhood)是什么意思?从预期结果来看,你只是在第一个和第三个值上加了 30 秒。 – tgdn

+0

@tgdn一个邻域是一组近似值 – JuanPablo

回答

0
import datetime as dt 

dates = [ 
    dt.datetime(1970, 1, 1, 0, 2), 
    dt.datetime(1970, 1, 1, 0, 3), 
    dt.datetime(1970, 1, 1, 0, 12), 
    dt.datetime(1970, 1, 1, 0, 7), 
    dt.datetime(1970, 1, 1, 0, 8), 
    dt.datetime(1970, 1, 1, 0, 9), 
    dt.datetime(1970, 1, 1, 0, 13) 
] 

def group_dates(dates, delta):
    """Split ``dates`` into runs of neighbouring datetimes.

    The input is walked in the order given. A datetime joins the current
    group when the absolute gap to the entry right before it is at most
    ``delta``; otherwise it opens a new group.

    Args:
        dates: Iterable of ``datetime`` objects.
        delta: ``timedelta`` giving the largest gap allowed between two
            consecutive entries of the same group.

    Returns:
        A list of lists of datetimes, one inner list per group. An empty
        input produces an empty list.
    """
    grouped = []
    prev = None
    for dte in dates:
        # abs(): an out-of-order entry yields a negative difference, which
        # must not be mistaken for "within delta".
        if grouped and abs(dte - prev) <= delta:
            grouped[-1].append(dte)
        else:
            grouped.append([dte])
        prev = dte
    return grouped
def td(l):
    """Return the arithmetic mean of the datetimes in ``l``.

    The mean is computed with exact ``timedelta`` arithmetic relative to
    1970-01-01 instead of converting to float seconds and back through
    ``datetime.utcfromtimestamp`` (deprecated since Python 3.12).

    Args:
        l: Sized collection (list or tuple) of ``datetime`` objects that can
            be subtracted from the naive epoch ``datetime(1970, 1, 1)``.

    Returns:
        A ``datetime`` holding the mean of the inputs.

    Raises:
        ZeroDivisionError: If ``l`` is empty.
    """
    epoch = dt.datetime(1970, 1, 1)
    # Start the sum from a zero timedelta; sum() would otherwise start at int 0.
    total = sum((d - epoch for d in l), dt.timedelta())
    return epoch + total / len(l)


from pprint import pprint as pp 
pp([td(sub) for sub in group_dates(dates,dt.timedelta(minutes=2))]) 

为了避免不必要的函数调用,可以先检查 len:

pp([td(sub) if len(sub) > 1 else sub[0] for sub in group_dates(dates,dt.timedelta(minutes=2))]) 
[datetime.datetime(1970, 1, 1, 0, 2, 30), 
datetime.datetime(1970, 1, 1, 0, 12), 
datetime.datetime(1970, 1, 1, 0, 8), 
datetime.datetime(1970, 1, 1, 0, 13)] 

或者边遍历边 yield 结果:

def group_dates(dates, delta):
    """Yield one averaged datetime per run of neighbouring datetimes.

    The input is walked in the order given. A datetime joins the current
    group when the absolute gap to the entry right before it is at most
    ``delta``; otherwise the current group is reduced with ``td`` and
    yielded, and a new group is started.

    Args:
        dates: Iterable of ``datetime`` objects.
        delta: ``timedelta`` giving the largest gap allowed between two
            consecutive entries of the same group.

    Yields:
        The result of ``td(group)`` for each group, in input order. Nothing
        is yielded for an empty input.
    """
    grouped = []
    prev = None
    for dte in dates:
        # abs(): an out-of-order entry yields a negative difference, which
        # must not be mistaken for "within delta".
        if grouped and abs(dte - prev) > delta:
            yield td(grouped)
            grouped = []
        grouped.append(dte)
        prev = dte
    # Flush the trailing group; skipped entirely when the input was empty.
    if grouped:
        yield td(grouped)

pp(list(group_dates(dates, delta=dt.timedelta(minutes=2)))) 
[datetime.datetime(1970, 1, 1, 0, 2, 30), 
datetime.datetime(1970, 1, 1, 0, 12), 
datetime.datetime(1970, 1, 1, 0, 8), 
datetime.datetime(1970, 1, 1, 0, 13)] 

一些计时:

In [28]: dates = [               
    dt.datetime(1970, 1, 1, 0, 2), 
    dt.datetime(1970, 1, 1, 0, 3), 
    dt.datetime(1970, 1, 1, 0, 4), 
    dt.datetime(1970, 1, 1, 0, 7), 
    dt.datetime(1970, 1, 1, 0, 8), 
    dt.datetime(1970, 1, 1, 0, 9), 
    dt.datetime(1970, 1, 1, 0, 15), 
    dt.datetime(1970, 1, 1, 0, 22), 
    dt.datetime(1970, 1, 1, 0, 24), 
    dt.datetime(1970, 1, 1, 0, 27) 
] 

In [41]: for i in range(10000):  
      dates.append(dates[-1]+dt.timedelta(minutes=choice([1,2,3,4]))) 
    ....:  
In [42]: timeit [td(sub) if len(sub) > 1 else sub[0] for sub in group_dates(dates,dt.timedelta(minutes=2))] 
100 loops, best of 3: 15.8 ms per loop 

In [43]: timeit reduce_datetime_list_by_delta(dates, delta)       
100 loops, best of 3: 16.9 ms per loop 

In [44]: timeit timestamps = map(avgtm, groupby(dates, key=grouper(delta))) 
10 loops, best of 3: 18.8 ms per loop 

In [45]: timeit (list(group_dates_iter(dates, delta = dt.timedelta(minutes=2)))) 
10 loops, best of 3: 18.4 ms per loop 
+0

http://ideone.com/aqqiKY – JuanPablo

+0

@JuanPablo,那是对的吗? –

+0

是的……但是 'dt.datetime(1970,1,1,0,12)' 这个值怎么了?这个值应该单独成为一组 – JuanPablo

0
import datetime as dt 

def datetime_to_epoch(dtime):
    """Return the seconds elapsed between 1970-01-01 00:00 and ``dtime``.

    Args:
        dtime: ``datetime`` that can be subtracted from the naive
            ``datetime(1970, 1, 1)``.

    Returns:
        The offset as a float number of seconds (negative before 1970).
    """
    epoch_start = dt.datetime(1970, 1, 1)
    elapsed = dtime - epoch_start
    return elapsed.total_seconds()

def datetime_sublists(datetime_list, time_delta = dt.timedelta(days=1)):
    """Split ``datetime_list`` into sublists of neighbouring datetimes.

    The input is walked in the order given. A datetime is appended to the
    current sublist when ``current - previous <= time_delta``; otherwise it
    starts a new sublist. A datetime earlier than its predecessor gives a
    negative difference and therefore always stays in the current sublist.

    Args:
        datetime_list: Iterable of ``datetime`` objects.
        time_delta: Largest gap allowed between two consecutive entries of
            the same sublist. Defaults to one day.

    Returns:
        A list of lists of datetimes. An empty input produces an empty list.
    """
    sublists = []
    for current_date in datetime_list:
        # The previous datetime is always the last entry of the open sublist.
        if sublists and current_date - sublists[-1][-1] <= time_delta:
            sublists[-1].append(current_date)
        else:
            sublists.append([current_date])
    return sublists

def reduce_datetime_list_by_delta(date_list, delta):
    """Collapse each run of neighbouring datetimes into its mean datetime.

    ``date_list`` is split with ``datetime_sublists(date_list, delta)`` and
    every resulting group is replaced by the average of its members'
    ``datetime_to_epoch`` values, converted back to a ``datetime``.

    Args:
        date_list: Datetimes to reduce, passed on to ``datetime_sublists``.
        delta: ``timedelta`` passed on as the grouping threshold.

    Returns:
        A list with one ``datetime`` per group, in group order.
    """
    # Epoch + timedelta replaces datetime.utcfromtimestamp, which is
    # deprecated since Python 3.12 and limited to the platform's time_t range.
    epoch = dt.datetime(1970, 1, 1)

    reduced = []
    for group in datetime_sublists(date_list, delta):
        epochs = [datetime_to_epoch(date) for date in group]
        epoch_average = sum(epochs) / len(epochs)
        reduced.append(epoch + dt.timedelta(seconds=epoch_average))

    return reduced


dates = [ 
    dt.datetime(1970, 1, 1, 0, 2), 
    dt.datetime(1970, 1, 1, 0, 3), 
    dt.datetime(1970, 1, 1, 0, 7), 
    dt.datetime(1970, 1, 1, 0, 8), 
    dt.datetime(1970, 1, 1, 0, 12) 
] 

delta = dt.timedelta(minutes=2) 

print reduce_datetime_list_by_delta(dates, delta) 
+0

http://ideone.com/Pd6Gdn – JuanPablo

2

问题的描述本身已经透露了答案:你需要使用 itertools 中的 groupby() 函数。

只需要一个稍微聪明一点的 key 函数:它记住上一次的状态,只要连续的时间戳之间的间隔比 delta 更近,就一直返回相同的 key 值。

分组之后,把得到的各组转换为平均时间,并注意处理只含单个时间戳的组(示例中已包含这种情况)。

import datetime as dt 
from itertools import groupby 

dates = [ 
     dt.datetime(1970, 1, 1, 0, 2), 
     dt.datetime(1970, 1, 1, 0, 3), 
     dt.datetime(1970, 1, 1, 0, 7), 
     dt.datetime(1970, 1, 1, 0, 8), 
     dt.datetime(1970, 1, 1, 0, 13) 
    ] 
delta = dt.timedelta(minutes=2) 

class grouper:
    """Stateful key function for ``itertools.groupby`` over time stamps.

    Each call returns the key of the group the time stamp belongs to. The
    key is the first time stamp of the current group: it is kept for as long
    as incoming time stamps are no more than ``delta`` after it, and is
    replaced by the incoming time stamp otherwise.
    """

    def __init__(self, delta):
        # delta: largest allowed distance between a time stamp and the
        # current key; last: current key, None until the first call.
        self.delta = delta
        self.last = None

    def __call__(self, tm):
        """Return the group key for ``tm``, opening a new group if needed."""
        anchor = self.last
        if anchor is None or tm - anchor > self.delta:
            # First call, or tm is too far past the current key: tm becomes
            # the key of a new group.
            self.last = tm
        return self.last

# transform the result of groupby into average times 
def avgtm(item):
    """Reduce one ``(key, group)`` pair from ``groupby`` to a single time.

    Args:
        item: Two-element tuple; the second element is an iterable of time
            stamps, the first (the key) is ignored.

    Returns:
        The only time stamp for a one-element group, otherwise the point
        halfway between the group's first and last time stamps.
    """
    _, members = item
    # groupby hands over a one-shot iterator; materialise it for indexing.
    members = list(members)
    first = members[0]
    if len(members) <= 1:
        return first
    span = members[-1] - first
    return first + span / 2

timestamps = map(avgtm, groupby(dates, key=grouper(delta))) 
print "Time stamps: ",timestamps 

产生的输出:

Time stamps: [datetime.datetime(1970, 1, 1, 0, 2, 30), 
       datetime.datetime(1970, 1, 1, 0, 7, 30), 
       datetime.datetime(1970, 1, 1, 0, 13)] 
+1

使用带有合适比较器的 'itertools.groupby' 也是我首先想到的。你可以简单地写 'not self.last' 来代替 'self.last is None';另外,如果用生成器表达式 'timestamps = (avgtm(list(tms)) for (_, tms) in groupby(dates, key=grouper(delta)))' 代替 'map',那么 'avgtm' 还可以再简化一些。 –

+0

优秀的建议,thx! – haavee