Skip to content

add support for DatetimeIndexer #45

Closed
@jreback

Description

@jreback
from pandas.api.indexers import BaseIndexer

def calculate_variable_window_bounds(left, right, index):
    """Compute per-row ``(start, end)`` positions from per-row bounds.

    Parameters
    ----------
    left, right : sequence
        Lower and upper bound for each row; compared directly against
        the values of ``index``. Must have the same length as ``index``.
    index : sequence
        The value of each row (the caller in this file passes int64
        views of datetimes).

    Returns
    -------
    tuple of numpy.ndarray
        ``(start, end)``, two int64 arrays of length ``len(index)``.
        Position 0 keeps the sentinel ``-1`` when the corresponding
        initial condition does not hold. Empty input yields two empty
        arrays.

    Raises
    ------
    ValueError
        If ``left``, ``right`` and ``index`` differ in length.
    """
    num_values = len(index)
    if not (len(left) == len(right) == num_values):
        raise ValueError(
            "left, right and index must all have the same length"
        )

    # -1 marks "not computed"; every position i >= 1 is overwritten below.
    start = np.full(num_values, -1, dtype='int64')
    end = np.full(num_values, -1, dtype='int64')

    # Nothing to scan; also avoids indexing position 0 of empty input.
    if num_values == 0:
        return start, end

    # initial conditions
    if index[0] > left[0]:
        start[0] = 0
    if index[0] <= right[0]:
        end[0] = 1

    for i in range(1, num_values):
        value = index[i]

        # Carry the previous start forward, then advance it while the
        # current value lies outside the [left[j], right[j]] range of
        # the candidate row j; stop at the first row that contains it.
        # NOTE(review): if start[i - 1] is still -1 the scan begins at
        # j == -1, which reads the *last* element of left/right --
        # confirm whether that is intended.
        start[i] = start[i - 1]
        for j in range(start[i - 1], i):
            if value > right[j] or value < left[j]:
                start[i] = j
            else:
                break

        # The end bound either extends to include the current row or
        # stays where it was for the previous row.
        # NOTE(review): if end[i - 1] is still -1, index[-1] reads the
        # last element -- confirm whether that is intended.
        if (index[end[i - 1]] - right[i]) <= 0:
            end[i] = i + 1
        else:
            end[i] = end[i - 1]

    return start, end

class DatetimeIndexer(BaseIndexer):
    """Rolling-window indexer whose bounds come from explicit datetimes.

    Reads three attributes from the instance: ``starts``, ``ends`` and
    ``points``. In the usage shown alongside this class they are passed
    as keyword arguments to the constructor (presumably stored as
    attributes by ``BaseIndexer`` -- confirm against the pandas version
    in use).
    """

    def get_window_bounds(self, num_values, min_periods, center, closed,
                          step=None):
        """Return ``(start, end)`` arrays for every row.

        All arguments are accepted for compatibility with the
        ``BaseIndexer`` interface and are not used: the bounds are
        derived solely from ``self.starts``, ``self.ends`` and
        ``self.points``. ``step`` defaults to ``None`` so that callers
        which do not pass it keep working.
        """
        # starts, ends, points are all DTI; reinterpret each as int64 so
        # they can be compared as plain integers.
        starts = np.asarray(self.starts.view('i8'))
        ends = np.asarray(self.ends.view('i8'))
        points = np.asarray(self.points.view('i8'))
        return calculate_variable_window_bounds(starts, ends, points)

Input frame

# Sample input: one CSV row per tweet with the columns
# ticker, datetime and sentiment.
# NOTE(review): `pd`, `StringIO` and `dedent` are not imported in the
# visible snippet -- presumably pandas, io.StringIO and
# textwrap.dedent; confirm.
tweets_str = """
             ticker,datetime,sentiment
             GOOG,2020-05-27 15:00,0.6
             GOOG,2020-05-28 11:00,0.5
             IBM,2020-05-28 12:00,-0.1
             GOOG,2020-05-28 13:00,0.2
             GOOG,2020-05-28 20:00,0.3
             GOOG,2020-05-29 07:00,-0.1
             IBM,2020-05-29 09:00,-0.3
             IBM,2020-05-29 12:00,-0.4
             GOOG,2020-05-30 07:00,-0.2
             GOOG,2020-05-30 08:00,-0.5
             GOOG,2020-05-30 10:00,0.1
             GOOG,2020-05-30 14:00,0.3
             GOOG,2020-05-31 07:00,-0.1
             GOOG,2020-06-01 08:00,0.2
             GOOG,2020-06-01 10:00,0.4
             """
# Parse the literal above as CSV, asking pandas to parse the
# `datetime` column as dates.
tweets = pd.read_csv(StringIO(dedent(tweets_str)), parse_dates=["datetime"])

Call it like this

# Timestamp of every tweet; reused for the bounds and the points.
timestamps = tweets["datetime"]

# A single business day, the unit the window bounds are expressed in.
bd = 1 * pd.tseries.offsets.BusinessDay()

# Lower bound: each timestamp minus one business day.
starts = timestamps - (1 * bd)
# Upper bound: each timestamp minus zero business days.
# NOTE(review): a zero-count BusinessDay offset may not be a no-op for
# weekend timestamps -- confirm this is the intended upper bound.
ends = timestamps - (0 * bd)

# Roll over the frame with the custom indexer and average the sentiment.
indexer = DatetimeIndexer(starts=starts, ends=ends, points=timestamps)
tweets.rolling(window=indexer)["sentiment"].mean()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions