diff --git a/src/opensignals/data/provider.py b/src/opensignals/data/provider.py index 2fb6278..dc12302 100644 --- a/src/opensignals/data/provider.py +++ b/src/opensignals/data/provider.py @@ -24,6 +24,16 @@ class Provider(ABC): """Common base class for (daily) stock price data""" + numerai_ticker_col = 'bloomberg_ticker' + + def __init__(self): + # once round 665 opens on 2024-01-23, bloomberg_ticker will be renamed to numerai_ticker + target_time = dt.datetime(2024, 1, 23, 13, 0, 0, tzinfo=dt.timezone.utc) + if dt.datetime.now(dt.timezone.utc) >= target_time: + Provider.numerai_ticker_col = 'numerai_ticker' + else: + Provider.numerai_ticker_col = 'bloomberg_ticker' + @staticmethod def get_tickers() -> pd.DataFrame: ticker_map = pd.read_csv(SIGNALS_TICKER_MAP) @@ -34,22 +44,22 @@ def get_tickers() -> pd.DataFrame: num = ticker_map["yahoo"].duplicated().values.sum() raise Exception(f'Found duplicated {num} yahoo tickers') - if ticker_map['bloomberg_ticker'].duplicated().any(): - num = ticker_map["bloomberg_ticker"].duplicated().values.sum() - raise Exception(f'Found duplicated {num} bloomberg_ticker tickers') + if ticker_map[Provider.numerai_ticker_col].duplicated().any(): + num = ticker_map[Provider.numerai_ticker_col].duplicated().values.sum() + raise Exception(f'Found duplicated {num} numerai_ticker tickers') return ticker_map @staticmethod def get_ticker_data(db_dir: pathlib.Path) -> pd.DataFrame: ticker_data = pd.DataFrame({ - 'bloomberg_ticker': pd.Series([], dtype='str'), + Provider.numerai_ticker_col: pd.Series([], dtype='str'), 'date': pd.Series([], dtype='datetime64[ns]') }) if len(list(db_dir.rglob('*.parquet'))) > 0: ticker_data = pd.read_parquet(db_dir) - num = ticker_data.bloomberg_ticker.unique().shape[0] + num = ticker_data[Provider.numerai_ticker_col].unique().shape[0] logger.info(f'Retrieving data for {num} tickers from the database') return ticker_data @@ -60,17 +70,17 @@ def get_ticker_missing(ticker_data: pd.DataFrame, last_friday: Optional[dt.datetime] = None) -> pd.DataFrame: if last_friday is None: last_friday = dt.datetime.today() - relativedelta(weekday=FR(-1)) - tickers_available_data = ticker_data.groupby('bloomberg_ticker').agg({'date': [max, min]}) + tickers_available_data = ticker_data.groupby(Provider.numerai_ticker_col).agg({'date': [max, min]}) tickers_available_data.columns = ['date_max', 'date_min'] eligible_tickers_available_data = ticker_map.merge( tickers_available_data.reset_index(), - on='bloomberg_ticker', + on=Provider.numerai_ticker_col, how='left' ) ticker_not_found = eligible_tickers_available_data.loc[ - eligible_tickers_available_data.date_max.isna(), ['bloomberg_ticker', 'yahoo'] + eligible_tickers_available_data.date_max.isna(), [Provider.numerai_ticker_col, 'yahoo'] ] ticker_not_found['start'] = '2002-12-01' @@ -81,7 +91,7 @@ def get_ticker_missing(ticker_data: pd.DataFrame, (eligible_tickers_available_data.date_max < last_friday.strftime('%Y-%m-%d')) & (eligible_tickers_available_data.date_max > last_friday_52.strftime('%Y-%m-%d')) ), - ['bloomberg_ticker', 'yahoo', 'date_max'] + [Provider.numerai_ticker_col, 'yahoo', 'date_max'] ] tickers_outdated['start'] = ( @@ -103,7 +113,7 @@ def get_live_data(ticker_data: pd.DataFrame, last_friday: dt.date) -> pd.DataFra # Only select tickers than aren't already present in live_data thursday_data = thursday_data[ - ~thursday_data.bloomberg_ticker.isin(live_data.bloomberg_ticker.values) + ~thursday_data[Provider.numerai_ticker_col].isin(live_data[Provider.numerai_ticker_col].values) ].copy() live_data = pd.concat([live_data, thursday_data]) @@ -117,7 +127,7 @@ def get_train_test_data(ticker_data: pd.DataFrame, """merge our feature data with Numerai targets""" ml_data = pd.merge( ticker_data, targets, - on=['date', 'bloomberg_ticker'], + on=['date', Provider.numerai_ticker_col], how='left' ) @@ -229,8 +239,8 @@ def download_data(self, db_dir: pathlib.Path, recreate: bool = False) -> None: temp_df['created_at'] = dt.datetime.now() temp_df['volume'] = temp_df['volume'].astype('float64') - temp_df['bloomberg_ticker'] = temp_df['bloomberg_ticker'].map( - dict(zip(ticker_map['yahoo'], ticker_map['bloomberg_ticker']))) + temp_df[Provider.numerai_ticker_col] = temp_df[Provider.numerai_ticker_col].map( + dict(zip(ticker_map['yahoo'], ticker_map[Provider.numerai_ticker_col]))) concat_dfs.append(temp_df) @@ -239,7 +249,7 @@ def download_data(self, db_dir: pathlib.Path, recreate: bool = False) -> None: return df = pd.concat(concat_dfs) - n_ticker_data = df.bloomberg_ticker.unique().shape[0] + n_ticker_data = df[Provider.numerai_ticker_col].unique().shape[0] if n_ticker_data <= 0: logger.info('Dataset up to date') return diff --git a/src/opensignals/data/yahoo.py b/src/opensignals/data/yahoo.py index 0c2b4fa..803cee6 100644 --- a/src/opensignals/data/yahoo.py +++ b/src/opensignals/data/yahoo.py @@ -19,7 +19,7 @@ def download_ticker(self, ticker: str, start: dt.datetime, end: dt.datetime) -> def empty_df() -> pd.DataFrame: return pd.DataFrame(columns=[ - "date", "bloomberg_ticker", + "date", Provider.numerai_ticker_col, "open", "high", "low", "close", "adj_close", "volume", "currency", "provider"]) @@ -61,7 +61,7 @@ def empty_df() -> pd.DataFrame: df = pd.DataFrame({ "date": pd.to_datetime(timestamps, unit="s").normalize(), - "bloomberg_ticker": ticker, + Provider.numerai_ticker_col: ticker, "open": np.array(opens, dtype='float32'), "high": np.array(highs, dtype='float32'), "low": np.array(lows, dtype='float32'), diff --git a/src/opensignals/features.py b/src/opensignals/features.py index 7a64a84..8e08e2c 100644 --- a/src/opensignals/features.py +++ b/src/opensignals/features.py @@ -1,4 +1,6 @@ +import datetime as dt import logging +from abc import ABC, abstractmethod from typing import List, Optional, Protocol, Dict, Tuple, Union import numpy as np @@ -7,7 +9,19 @@ logger = logging.getLogger(__name__) -class FeatureGenerator(Protocol): +class FeatureGenerator(ABC): + + numerai_ticker_col = 'bloomberg_ticker' + + def __init__(self): + # once round 665 opens on 2024-01-23, bloomberg_ticker will be renamed to numerai_ticker + target_time = dt.datetime(2024, 1, 23, 13, 0, 0, tzinfo=dt.timezone.utc) + if dt.datetime.now(dt.timezone.utc) >= target_time: + FeatureGenerator.numerai_ticker_col = 'numerai_ticker' + else: + FeatureGenerator.numerai_ticker_col = 'bloomberg_ticker' + + @abstractmethod def generate_features(self, ticker_data: pd.DataFrame, feature_prefix: Optional[str] = None) -> Tuple[pd.DataFrame, List[str]]: pass @@ -27,7 +41,7 @@ def generate_features(self, if feature_prefix: feature_prefix_name = f'{feature_prefix}_{feature_prefix_name}' - ticker_groups = ticker_data.groupby('bloomberg_ticker') + ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col) ticker_data[feature_prefix_name] = \ ticker_groups[self.variable].transform( lambda x: x.shift(self.num_days)) @@ -103,7 +117,7 @@ def generate_features(self, if feature_prefix: feature_prefix_name = f'{feature_prefix}_{feature_prefix_name}' - ticker_groups = ticker_data.groupby('bloomberg_ticker') + ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col) ticker_data[feature_prefix_name] = \ ticker_groups[self.variable].transform( lambda x: self.relative_strength_index(x, self.interval) @@ -129,7 +143,7 @@ def generate_features(self, # create lagged features grouped by ticker logger.debug('grouping by ticker...') - ticker_groups = ticker_data.groupby('bloomberg_ticker') + ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col) # lag 0 is that day's value, lag 1 is yesterday's value, etc logger.debug('generating lagged RSI quintiles...') @@ -192,7 +206,7 @@ def generate_features(self, if feature_prefix: feature_prefix_name = f'{feature_prefix}_{feature_prefix_name}' - ticker_groups = ticker_data.groupby('bloomberg_ticker') + ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col) ticker_data[feature_prefix_name] = \ ticker_groups[self.variable].transform( lambda x: self.simple_moving_average(x, self.interval) @@ -218,7 +232,7 @@ def generate_features(self, # create lagged features grouped by ticker logger.debug('grouping by ticker...') - ticker_groups = ticker_data.groupby('bloomberg_ticker') + ticker_groups = ticker_data.groupby(FeatureGenerator.numerai_ticker_col) # lag 0 is that day's value, lag 1 is yesterday's value, etc logger.debug('generating lagged SMA quintiles...') diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/test_data_pipeline.py b/tests/integration/test_data_pipeline.py new file mode 100644 index 0000000..1eda14f --- /dev/null +++ b/tests/integration/test_data_pipeline.py @@ -0,0 +1,46 @@ +import unittest +from pathlib import Path + +from opensignals.data.yahoo import Yahoo +from opensignals.features import RSI, SMA + + +class TestDataPipeline(unittest.TestCase): + + def test_yahoo_rsi_data(self): + db_dir = Path('db') + + yahoo = Yahoo() + yahoo.download_data(db_dir) + + features_generators = [ + RSI(num_days=5, interval=14, variable='adj_close'), + RSI(num_days=5, interval=21, variable='adj_close'), + SMA(num_days=5, interval=14, variable='adj_close'), + SMA(num_days=5, interval=21, variable='adj_close'), + ] + + train, test, live, feature_names = yahoo.get_data(db_dir, + features_generators=features_generators, + feature_prefix='feature') + + # check that all the features are in each dataset + for df in [train, test, live]: + for feature_name in feature_names: + self.assertTrue(feature_name in df.columns) + + # live dataset should be at least 3000 tickers + self.assertTrue(live.shape[0] > 3000) + + # training dataset should be 859750 rows + self.assertTrue(train.shape[0] >= 859750) + + # test dataset should be at least 1830145 rows + self.assertTrue(test.shape[0] >= 1830145) + + # TODO: what assertions for data? + # feature_names are expected... + # live df > 3000 rows? + # training df should always be same length (859750) + # test df should be at least (1830145) + print(train) \ No newline at end of file diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_feature_generator_numerai_ticker_col.py b/tests/unit/test_feature_generator_numerai_ticker_col.py new file mode 100644 index 0000000..1f396d7 --- /dev/null +++ b/tests/unit/test_feature_generator_numerai_ticker_col.py @@ -0,0 +1,34 @@ +import unittest +from unittest.mock import patch +import datetime as dt +from opensignals.features import RSI + + +class TestFeatureGeneratorNumeraiTickerCol(unittest.TestCase): + + @patch('opensignals.features.dt.datetime', wraps=dt.datetime) + def test_numerai_ticker_col_before_datetime(self, mock_datetime): + mock_datetime.now.return_value = dt.datetime(2024, 1, 23, 12, 0, 0, tzinfo=dt.timezone.utc) + + rsi = RSI() + self.assertEqual(rsi.numerai_ticker_col, 'bloomberg_ticker') + + @patch('opensignals.features.dt.datetime', wraps=dt.datetime) + def test_numerai_ticker_col_on_datetime(self, mock_datetime): + # Set the current time to exactly 2024-01-23 13:00:00 UTC + mock_datetime.now.return_value = dt.datetime(2024, 1, 23, 13, 0, 0, tzinfo=dt.timezone.utc) + + rsi = RSI() + self.assertEqual(rsi.numerai_ticker_col, 'numerai_ticker') + + @patch('opensignals.features.dt.datetime', wraps=dt.datetime) + def test_numerai_ticker_col_after_datetime(self, mock_datetime): + # Set the current time to after 2024-01-23 13:00:00 UTC + mock_datetime.now.return_value = dt.datetime(2024, 1, 23, 14, 0, 0, tzinfo=dt.timezone.utc) + + rsi = RSI() + self.assertEqual(rsi.numerai_ticker_col, 'numerai_ticker') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/unit/test_provider_numerai_ticker_col.py b/tests/unit/test_provider_numerai_ticker_col.py new file mode 100644 index 0000000..fd5a5d1 --- /dev/null +++ b/tests/unit/test_provider_numerai_ticker_col.py @@ -0,0 +1,34 @@ +import unittest +from unittest.mock import patch +import datetime as dt +from opensignals.data.yahoo import Yahoo + + +class TestProviderNumeraiTickerCol(unittest.TestCase): + + @patch('opensignals.data.provider.dt.datetime', wraps=dt.datetime) + def test_numerai_ticker_col_before_datetime(self, mock_datetime): + mock_datetime.now.return_value = dt.datetime(2024, 1, 23, 12, 0, 0, tzinfo=dt.timezone.utc) + + provider = Yahoo() + self.assertEqual(provider.numerai_ticker_col, 'bloomberg_ticker') + + @patch('opensignals.data.provider.dt.datetime', wraps=dt.datetime) + def test_numerai_ticker_col_on_datetime(self, mock_datetime): + # Set the current time to exactly 2024-01-23 13:00:00 UTC + mock_datetime.now.return_value = dt.datetime(2024, 1, 23, 13, 0, 0, tzinfo=dt.timezone.utc) + + provider = Yahoo() + self.assertEqual(provider.numerai_ticker_col, 'numerai_ticker') + + @patch('opensignals.data.provider.dt.datetime', wraps=dt.datetime) + def test_numerai_ticker_col_after_datetime(self, mock_datetime): + # Set the current time to after 2024-01-23 13:00:00 UTC + mock_datetime.now.return_value = dt.datetime(2024, 1, 23, 14, 0, 0, tzinfo=dt.timezone.utc) + + provider = Yahoo() + self.assertEqual(provider.numerai_ticker_col, 'numerai_ticker') + + +if __name__ == '__main__': + unittest.main()