Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 24 additions & 14 deletions src/opensignals/data/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@
class Provider(ABC):
"""Common base class for (daily) stock price data"""

# Round 665 opens on 2024-01-23 13:00 UTC; from that moment the ticker
# column is called numerai_ticker instead of bloomberg_ticker.
_TICKER_COL_CUTOVER = dt.datetime(2024, 1, 23, 13, 0, 0, tzinfo=dt.timezone.utc)

# Resolved once when the class is defined, so the static methods that read
# Provider.numerai_ticker_col directly see the right column even when no
# provider instance has been constructed yet.
numerai_ticker_col = (
    'numerai_ticker'
    if dt.datetime.now(dt.timezone.utc) >= _TICKER_COL_CUTOVER
    else 'bloomberg_ticker'
)

def __init__(self):
    """Re-resolve the shared ticker column name against the current time.

    The result is stored on ``Provider`` itself rather than on the
    instance, so it is what every static method and every other provider
    instance reads afterwards.
    """
    now = dt.datetime.now(dt.timezone.utc)
    Provider.numerai_ticker_col = (
        'numerai_ticker'
        if now >= Provider._TICKER_COL_CUTOVER
        else 'bloomberg_ticker'
    )

@staticmethod
def get_tickers() -> pd.DataFrame:
ticker_map = pd.read_csv(SIGNALS_TICKER_MAP)
Expand All @@ -34,22 +44,22 @@ def get_tickers() -> pd.DataFrame:
num = ticker_map["yahoo"].duplicated().values.sum()
raise Exception(f'Found duplicated {num} yahoo tickers')

if ticker_map['bloomberg_ticker'].duplicated().any():
num = ticker_map["bloomberg_ticker"].duplicated().values.sum()
raise Exception(f'Found duplicated {num} bloomberg_ticker tickers')
if ticker_map[Provider.numerai_ticker_col].duplicated().any():
num = ticker_map[Provider.numerai_ticker_col].duplicated().values.sum()
raise Exception(f'Found duplicated {num} numerai_ticker tickers')

return ticker_map

@staticmethod
def get_ticker_data(db_dir: pathlib.Path) -> pd.DataFrame:
ticker_data = pd.DataFrame({
'bloomberg_ticker': pd.Series([], dtype='str'),
Provider.numerai_ticker_col: pd.Series([], dtype='str'),
'date': pd.Series([], dtype='datetime64[ns]')
})
if len(list(db_dir.rglob('*.parquet'))) > 0:
ticker_data = pd.read_parquet(db_dir)

num = ticker_data.bloomberg_ticker.unique().shape[0]
num = ticker_data[Provider.numerai_ticker_col].unique().shape[0]
logger.info(f'Retrieving data for {num} tickers from the database')

return ticker_data
Expand All @@ -60,17 +70,17 @@ def get_ticker_missing(ticker_data: pd.DataFrame,
last_friday: Optional[dt.datetime] = None) -> pd.DataFrame:
if last_friday is None:
last_friday = dt.datetime.today() - relativedelta(weekday=FR(-1))
tickers_available_data = ticker_data.groupby('bloomberg_ticker').agg({'date': [max, min]})
tickers_available_data = ticker_data.groupby(Provider.numerai_ticker_col).agg({'date': [max, min]})
tickers_available_data.columns = ['date_max', 'date_min']

eligible_tickers_available_data = ticker_map.merge(
tickers_available_data.reset_index(),
on='bloomberg_ticker',
on=Provider.numerai_ticker_col,
how='left'
)

ticker_not_found = eligible_tickers_available_data.loc[
eligible_tickers_available_data.date_max.isna(), ['bloomberg_ticker', 'yahoo']
eligible_tickers_available_data.date_max.isna(), [Provider.numerai_ticker_col, 'yahoo']
]

ticker_not_found['start'] = '2002-12-01'
Expand All @@ -81,7 +91,7 @@ def get_ticker_missing(ticker_data: pd.DataFrame,
(eligible_tickers_available_data.date_max < last_friday.strftime('%Y-%m-%d')) &
(eligible_tickers_available_data.date_max > last_friday_52.strftime('%Y-%m-%d'))
),
['bloomberg_ticker', 'yahoo', 'date_max']
[Provider.numerai_ticker_col, 'yahoo', 'date_max']
]

tickers_outdated['start'] = (
Expand All @@ -103,7 +113,7 @@ def get_live_data(ticker_data: pd.DataFrame, last_friday: dt.date) -> pd.DataFra

# Only select tickers that aren't already present in live_data
thursday_data = thursday_data[
~thursday_data.bloomberg_ticker.isin(live_data.bloomberg_ticker.values)
~thursday_data[Provider.numerai_ticker_col].isin(live_data[Provider.numerai_ticker_col].values)
].copy()

live_data = pd.concat([live_data, thursday_data])
Expand All @@ -117,7 +127,7 @@ def get_train_test_data(ticker_data: pd.DataFrame,
"""merge our feature data with Numerai targets"""
ml_data = pd.merge(
ticker_data, targets,
on=['date', 'bloomberg_ticker'],
on=['date', Provider.numerai_ticker_col],
how='left'
)

Expand Down Expand Up @@ -229,8 +239,8 @@ def download_data(self, db_dir: pathlib.Path, recreate: bool = False) -> None:

temp_df['created_at'] = dt.datetime.now()
temp_df['volume'] = temp_df['volume'].astype('float64')
temp_df['bloomberg_ticker'] = temp_df['bloomberg_ticker'].map(
dict(zip(ticker_map['yahoo'], ticker_map['bloomberg_ticker'])))
temp_df[Provider.numerai_ticker_col] = temp_df[Provider.numerai_ticker_col].map(
dict(zip(ticker_map['yahoo'], ticker_map[Provider.numerai_ticker_col])))

concat_dfs.append(temp_df)

Expand All @@ -239,7 +249,7 @@ def download_data(self, db_dir: pathlib.Path, recreate: bool = False) -> None:
return

df = pd.concat(concat_dfs)
n_ticker_data = df.bloomberg_ticker.unique().shape[0]
n_ticker_data = df[Provider.numerai_ticker_col].unique().shape[0]
if n_ticker_data <= 0:
logger.info('Dataset up to date')
return
Expand Down
4 changes: 2 additions & 2 deletions src/opensignals/data/yahoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def download_ticker(self, ticker: str, start: dt.datetime, end: dt.datetime) ->

def empty_df() -> pd.DataFrame:
return pd.DataFrame(columns=[
"date", "bloomberg_ticker",
"date", Provider.numerai_ticker_col,
"open", "high", "low", "close",
"adj_close", "volume", "currency", "provider"])

Expand Down Expand Up @@ -61,7 +61,7 @@ def empty_df() -> pd.DataFrame:

df = pd.DataFrame({
"date": pd.to_datetime(timestamps, unit="s").normalize(),
"bloomberg_ticker": ticker,
Provider.numerai_ticker_col: ticker,
"open": np.array(opens, dtype='float32'),
"high": np.array(highs, dtype='float32'),
"low": np.array(lows, dtype='float32'),
Expand Down
26 changes: 20 additions & 6 deletions src/opensignals/features.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import datetime as dt
import logging
from abc import ABC, abstractmethod
from typing import List, Optional, Protocol, Dict, Tuple, Union

import numpy as np
Expand All @@ -7,7 +9,19 @@
logger = logging.getLogger(__name__)


class FeatureGenerator(Protocol):
class FeatureGenerator(ABC):
    """Abstract base class for feature generators working on ticker data.

    ``numerai_ticker_col`` holds the name of the ticker column to group by.
    It is a class-level attribute shared by every generator.
    """

    # Round 665 opens on 2024-01-23 13:00 UTC; from that moment the ticker
    # column is called numerai_ticker instead of bloomberg_ticker.
    _TICKER_COL_CUTOVER = dt.datetime(2024, 1, 23, 13, 0, 0, tzinfo=dt.timezone.utc)

    # Resolved once when the class is defined, so the value is already
    # correct for subclasses whose own __init__ never chains up to this one.
    numerai_ticker_col = (
        'numerai_ticker'
        if dt.datetime.now(dt.timezone.utc) >= _TICKER_COL_CUTOVER
        else 'bloomberg_ticker'
    )

    def __init__(self):
        """Re-resolve the shared ticker column name against the current time.

        The result is written to ``FeatureGenerator`` itself, not to the
        instance, so all generators observe the same column name.
        """
        now = dt.datetime.now(dt.timezone.utc)
        FeatureGenerator.numerai_ticker_col = (
            'numerai_ticker'
            if now >= FeatureGenerator._TICKER_COL_CUTOVER
            else 'bloomberg_ticker'
        )

    @abstractmethod
    def generate_features(self, ticker_data: pd.DataFrame, feature_prefix: Optional[str] = None) -> Tuple[pd.DataFrame, List[str]]:
        """Generate features for ``ticker_data``; must be implemented by subclasses.

        :param ticker_data: frame of ticker rows to derive features from
        :param feature_prefix: optional prefix for the generated feature names
        :return: a ``(DataFrame, list of str)`` pair -- presumably the frame
            with the new feature columns and their names; confirm against
            the concrete generators
        """
        pass

Expand All @@ -27,7 +41,7 @@ def generate_features(self,
if feature_prefix:
feature_prefix_name = f'{feature_prefix}_{feature_prefix_name}'

ticker_groups = ticker_data.groupby('bloomberg_ticker')
ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col)
ticker_data[feature_prefix_name] = \
ticker_groups[self.variable].transform(
lambda x: x.shift(self.num_days))
Expand Down Expand Up @@ -103,7 +117,7 @@ def generate_features(self,
if feature_prefix:
feature_prefix_name = f'{feature_prefix}_{feature_prefix_name}'

ticker_groups = ticker_data.groupby('bloomberg_ticker')
ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col)
ticker_data[feature_prefix_name] = \
ticker_groups[self.variable].transform(
lambda x: self.relative_strength_index(x, self.interval)
Expand All @@ -129,7 +143,7 @@ def generate_features(self,

# create lagged features grouped by ticker
logger.debug('grouping by ticker...')
ticker_groups = ticker_data.groupby('bloomberg_ticker')
ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col)

# lag 0 is that day's value, lag 1 is yesterday's value, etc
logger.debug('generating lagged RSI quintiles...')
Expand Down Expand Up @@ -192,7 +206,7 @@ def generate_features(self,
if feature_prefix:
feature_prefix_name = f'{feature_prefix}_{feature_prefix_name}'

ticker_groups = ticker_data.groupby('bloomberg_ticker')
ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col)
ticker_data[feature_prefix_name] = \
ticker_groups[self.variable].transform(
lambda x: self.simple_moving_average(x, self.interval)
Expand All @@ -218,7 +232,7 @@ def generate_features(self,

# create lagged features grouped by ticker
logger.debug('grouping by ticker...')
ticker_groups = ticker_data.groupby('bloomberg_ticker')
ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col)

# lag 0 is that day's value, lag 1 is yesterday's value, etc
logger.debug('generating lagged SMA quintiles...')
Expand Down
Empty file added tests/__init__.py
Empty file.
Empty file added tests/integration/__init__.py
Empty file.
46 changes: 46 additions & 0 deletions tests/integration/test_data_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import unittest
from pathlib import Path

from opensignals.data.yahoo import Yahoo
from opensignals.features import RSI, SMA


class TestDataPipeline(unittest.TestCase):
    """Integration test: download Yahoo data, build features, split the data."""

    def test_yahoo_rsi_data(self):
        """Run the pipeline with RSI/SMA generators and sanity-check the splits."""
        db_dir = Path('db')

        yahoo = Yahoo()
        yahoo.download_data(db_dir)

        features_generators = [
            RSI(num_days=5, interval=14, variable='adj_close'),
            RSI(num_days=5, interval=21, variable='adj_close'),
            SMA(num_days=5, interval=14, variable='adj_close'),
            SMA(num_days=5, interval=21, variable='adj_close'),
        ]

        train, test, live, feature_names = yahoo.get_data(
            db_dir,
            features_generators=features_generators,
            feature_prefix='feature',
        )

        # every generated feature must be present in each dataset
        for split_name, df in (('train', train), ('test', test), ('live', live)):
            for feature_name in feature_names:
                self.assertIn(
                    feature_name, df.columns,
                    msg=f'{feature_name} missing from {split_name} dataset')

        # live dataset should have more than 3000 rows
        self.assertGreater(live.shape[0], 3000)

        # training dataset should be at least 859750 rows
        self.assertGreaterEqual(train.shape[0], 859750)

        # test dataset should be at least 1830145 rows
        self.assertGreaterEqual(test.shape[0], 1830145)

        # TODO: decide on stricter data assertions, e.g. whether the training
        # set should always be exactly 859750 rows rather than at least that.
Empty file added tests/unit/__init__.py
Empty file.
34 changes: 34 additions & 0 deletions tests/unit/test_feature_generator_numerai_ticker_col.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest
from unittest.mock import patch
import datetime as dt
from opensignals.features import RSI


class TestFeatureGeneratorNumeraiTickerCol(unittest.TestCase):
    """Checks the ticker column an RSI generator reports around 2024-01-23 13:00 UTC."""

    def _ticker_col_at_hour(self, hour):
        """Build an RSI while "now" is faked to 2024-01-23 <hour>:00 UTC and return its column."""
        with patch('opensignals.features.dt.datetime', wraps=dt.datetime) as fake_datetime:
            fake_datetime.now.return_value = dt.datetime(
                2024, 1, 23, hour, 0, 0, tzinfo=dt.timezone.utc)
            generator = RSI()
            return generator.numerai_ticker_col

    def test_numerai_ticker_col_before_datetime(self):
        # 12:00 UTC on the switch day: the old column name is still expected
        self.assertEqual(self._ticker_col_at_hour(12), 'bloomberg_ticker')

    def test_numerai_ticker_col_on_datetime(self):
        # exactly 13:00 UTC: the new column name is expected
        self.assertEqual(self._ticker_col_at_hour(13), 'numerai_ticker')

    def test_numerai_ticker_col_after_datetime(self):
        # 14:00 UTC, after the switch: the new column name is expected
        self.assertEqual(self._ticker_col_at_hour(14), 'numerai_ticker')


if __name__ == '__main__':
unittest.main()
34 changes: 34 additions & 0 deletions tests/unit/test_provider_numerai_ticker_col.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest
from unittest.mock import patch
import datetime as dt
from opensignals.data.yahoo import Yahoo


class TestProviderNumeraiTickerCol(unittest.TestCase):
    """Checks the ticker column a Yahoo provider reports around 2024-01-23 13:00 UTC."""

    def _ticker_col_at_hour(self, hour):
        """Build a Yahoo provider while "now" is faked to 2024-01-23 <hour>:00 UTC and return its column."""
        with patch('opensignals.data.provider.dt.datetime', wraps=dt.datetime) as fake_datetime:
            fake_datetime.now.return_value = dt.datetime(
                2024, 1, 23, hour, 0, 0, tzinfo=dt.timezone.utc)
            provider = Yahoo()
            return provider.numerai_ticker_col

    def test_numerai_ticker_col_before_datetime(self):
        # 12:00 UTC on the switch day: the old column name is still expected
        self.assertEqual(self._ticker_col_at_hour(12), 'bloomberg_ticker')

    def test_numerai_ticker_col_on_datetime(self):
        # exactly 13:00 UTC: the new column name is expected
        self.assertEqual(self._ticker_col_at_hour(13), 'numerai_ticker')

    def test_numerai_ticker_col_after_datetime(self):
        # 14:00 UTC, after the switch: the new column name is expected
        self.assertEqual(self._ticker_col_at_hour(14), 'numerai_ticker')


if __name__ == '__main__':
unittest.main()