boosted.api.api_util
# Copyright (C) 2020 Gradient Boosted Investments, Inc. - All Rights Reserved

import datetime
import logging
from collections import OrderedDict
from typing import Any, Dict, Optional, Tuple

import pandas as pd
from boosted.api.api_type import (
    BoostedAPIException,
    BoostedDataSetSchemaException,
    BoostedDate,
    ColumnConfig,
    ColumnRole,
    ColumnSubRole,
    ColumnValueType,
    DataSetConfig,
    DataSetFrequency,
    DataSetSubType,
    DataSetType,
    StrategyConfig,
)

logger = logging.getLogger("boosted.api.api_util")


def infer_dataset_schema(
    name: str,
    df: pd.DataFrame,
    dataset_type: DataSetType,
    dataset_subtype: DataSetSubType = DataSetSubType.DENSE,
    dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY,
    infer_dataset_report_period: bool = False,
    infer_from_column_names: bool = False,
) -> DataSetConfig:
    """Infer a :class:`DataSetConfig` from a DataFrame's columns and dtypes.

    Args:
        name: Name for the resulting dataset config.
        df: Input frame; its index must be a ``DatetimeIndex``.
        dataset_type: Dataset kind (STOCK, STRATEGY, ...); controls how the
            leading identifier columns are interpreted.
        dataset_subtype: DENSE vs SPARSE_* ; controls how many leading
            columns are treated as positional identifiers.
        dataset_frequency: Frequency recorded in the config.
        infer_dataset_report_period: If True, treat the fifth column of a
            sparse stock dataset as a plain variable rather than the
            REPORT_PERIOD identifier.
        infer_from_column_names: If True (stock datasets only), match the
            leading columns to identifier sub-roles by column name instead
            of by position.

    Returns:
        A populated DataSetConfig.

    Raises:
        BoostedDataSetSchemaException: If the index is not a DatetimeIndex
            or the frame has no columns.
    """
    # Sanity checks: time index and at least one feature column.
    # isinstance is the idiomatic (and subclass-safe) form of this check.
    if not isinstance(df.index, pd.DatetimeIndex):
        raise BoostedDataSetSchemaException("Index must be DatetimeIndex.")
    if len(df.columns) == 0:
        raise BoostedDataSetSchemaException("No feature columns exist.")

    datasetConfig = DataSetConfig(name, dataset_type, dataset_subtype, dataset_frequency)
    non_stock_identifier_seen = False

    def variable(name: str) -> ColumnConfig:
        # Plain numeric feature column.
        return ColumnConfig(name=name, role=ColumnRole.VARIABLE, value_type=ColumnValueType.NUMBER)

    for i, c in enumerate(zip(df.columns.values, df.dtypes.values)):
        # Process stock identifiers first, ensuring that they all lie grouped
        # at the front of the column list.
        if (
            infer_from_column_names
            and dataset_type == DataSetType.STOCK
            and not non_stock_identifier_seen
        ):
            # Make a good effort to match column names with identifiers by
            # stripping out punctuation and lowercasing.
            sub_role_match = ColumnSubRole.get_match(c[0])
            if sub_role_match:
                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=sub_role_match)
                datasetConfig.addColumn(f)
                continue
            # End the stock-identifier processing sequence as soon as we see
            # something that's not an identifier.
            non_stock_identifier_seen = True

        if dataset_type == DataSetType.STRATEGY and i < 1:
            # The first column of a strategy dataset is assumed to be the
            # security identifier; it is not added to the schema.
            strategy_column_name = c[0]
        elif (
            not infer_from_column_names
            and dataset_type == DataSetType.STOCK
            and (
                (dataset_subtype == DataSetSubType.DENSE and i < 3)
                or (
                    dataset_subtype in [DataSetSubType.SPARSE_HIST, DataSetSubType.SPARSE_FWD]
                    and i < 5
                )
            )
        ):
            # Positional identifiers: ISIN, country, currency, and (for
            # sparse subtypes only) report date / report period.
            if i == 0:
                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.ISIN)
            elif i == 1:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.COUNTRY
                )
            elif i == 2:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.CURRENCY
                )
            elif i == 3:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_DATE
                )
            elif i == 4:
                if infer_dataset_report_period:
                    f = variable(c[0])
                else:
                    f = ColumnConfig(
                        name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_PERIOD
                    )
            datasetConfig.addColumn(f)
        elif pd.api.types.is_numeric_dtype(c[1]):
            # Numeric dtype: a regular variable column.
            datasetConfig.addColumn(variable(c[0]))
        else:
            logger.info(
                "Only numeric types are supported now."
                ' IGNORING THIS COLUMN. Name = "{0}", type = {1}.'.format(c[0], c[1])
            )
    if dataset_type == DataSetType.STRATEGY:
        # OrderedDict.fromkeys de-duplicates while preserving first-seen order.
        strategies = list(OrderedDict.fromkeys(df[strategy_column_name].tolist()))
        for strategy in strategies:
            datasetConfig.addStrategy(StrategyConfig(name=strategy, source_name=strategy))
    return datasetConfig


def validate_start_and_end_dates(
    start_date: Optional[BoostedDate], end_date: Optional[BoostedDate]
) -> None:
    """Validate that start_date is not later than end_date.

    Args:
        start_date (BoostedDate): a date object or a valid ISO date string
        end_date (BoostedDate): a date object or a valid ISO date string

    Raises:
        ValueError: the arguments are not date objects or valid ISO date
            strings, or start_date is after end_date.
    """
    if isinstance(start_date, str):
        try:
            # Normalize to a date so it compares cleanly with a date argument
            # on the other side (date vs datetime comparison raises TypeError).
            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d").date()
        except ValueError:
            # Suppress the chained strptime traceback; it adds no information.
            raise ValueError("Start date must be a valid YYYY-MM-DD string.") from None
    if isinstance(end_date, str):
        try:
            end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError("End date must be a valid YYYY-MM-DD string.") from None
    if start_date and end_date and start_date > end_date:
        raise ValueError("Start date cannot be after end date!")


def get_valid_iso_dates(
    start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None
) -> Tuple[str, str]:
    """Return ``(start, end)`` as ISO-8601 date strings.

    Both dates must be provided, or neither. With neither, the range
    defaults to the 30 days ending today.

    Raises:
        BoostedAPIException: if exactly one of the two dates is provided.
    """
    if (start_date and not end_date) or (end_date and not start_date):
        raise BoostedAPIException("Must provide both start and end dates or neither")
    elif not end_date and not start_date:
        end_date = datetime.date.today()
        start_date = end_date - datetime.timedelta(days=30)  # default to 30 days
        end_date_iso = end_date.isoformat()
        start_date_iso = start_date.isoformat()
    else:
        if isinstance(start_date, datetime.date):
            start_date_iso = start_date.isoformat()
        else:
            # Assumed to already be a valid ISO string - TODO confirm callers
            # validate before calling.
            start_date_iso = start_date  # type: ignore
        if isinstance(end_date, datetime.date):
            end_date_iso = end_date.isoformat()
        else:
            end_date_iso = end_date  # type: ignore

    return start_date_iso, end_date_iso


# TODO this function is not called by anyone. Consider to remove it
def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]:
    """Return the coarsest period ("Y" then "M") whose largest bucket of rows
    stays under the upload row limit, or None if neither period fits."""
    maxrows = 30000
    periods = ["Y", "M"]
    for p in periods:
        sizes = []
        # mypy can't tell df.index is DatetimeIndex
        for t in df.index.to_period(p).unique():  # type: ignore[attr-defined]
            sizes.append(len(df.loc[str(t)]))
        if max(sizes) < maxrows:
            return p
    # If we got here, no period worked.
    return None


def protoCubeJsonDataToDataFrame(
    pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list
) -> pd.DataFrame:
    """Flatten proto-cube JSON data into a DataFrame indexed by (row, column).

    Args:
        pf: Proto-cube payload; ``pf[r]["columns"][c]["fields"]`` holds the
            field values for row ``r`` / column ``c``.
        row_name: Name for the row level of the resulting MultiIndex.
        rows: Row labels, aligned with ``pf``.
        column_name: Name for the column level of the resulting MultiIndex.
        columns: Column labels, aligned with each entry's ``"columns"`` list.
        fields: Names for the field value columns.

    Returns:
        DataFrame with a (row_name, column_name) MultiIndex and one column
        per entry of ``fields``.
    """
    pc_list = []
    for row_idx, row in enumerate(rows):
        for col_idx, col in enumerate(columns):
            pc_list.append([row, col] + pf[row_idx]["columns"][col_idx]["fields"])
    df = pd.DataFrame(pc_list)
    df = df.set_axis([row_name, column_name] + fields, axis="columns")
    df.set_index([row_name, column_name], inplace=True)
    return df


def getMaxClientVersion() -> Dict[str, Any]:
    """Return the newest published client build found in the download bucket.

    Scrapes the S3 bucket listing for ``boosted_insights_python_client``
    wheel keys and picks the highest version.

    Returns:
        Dict with ``"version"`` (dotted version string) and ``"weblink"``
        (full download URL).

    Raises:
        ValueError: if the bucket listing request does not return HTTP 200,
            or no client wheels are found in the listing.
    """
    import re

    import requests

    base_url = "https://boosted-downloads.s3.amazonaws.com/"
    url = f"{base_url}?list-type=2"
    OK = 200
    # Bounded timeout so a dead network can't hang the caller indefinitely.
    resp = requests.get(url, timeout=30)

    if resp.status_code != OK:
        raise ValueError(f"Bad Response Code: {resp.status_code}")

    # Capture the full wheel key (group 1) and its dotted version (group 2).
    groups = re.finditer(
        r"<Key>(boosted_insights_python_client-([0-9\.]+)-py3-[^<]+)", resp.text
    )

    def getVersion(x: Dict[str, Any]) -> Tuple:
        return x["version"]

    max_info = max(
        (
            {
                # Tuple of ints so versions compare numerically, not lexically.
                "version": tuple(map(int, group.group(2).split("."))),
                "weblink": f"{base_url}{group.group(1)}",
            }
            for group in groups
        ),
        key=getVersion,
    )
    max_info["version"] = ".".join(map(str, max_info["version"]))
    return max_info
logger =
<Logger boosted.api.api_util (WARNING)>
def
infer_dataset_schema( name: str, df: pandas.core.frame.DataFrame, dataset_type: boosted.api.api_type.DataSetType, dataset_subtype: boosted.api.api_type.DataSetSubType = <DataSetSubType.DENSE: 1>, dataset_frequency: boosted.api.api_type.DataSetFrequency = <DataSetFrequency.DAILY: 1>, infer_dataset_report_period: bool = False, infer_from_column_names: bool = False) -> boosted.api.api_type.DataSetConfig:
31def infer_dataset_schema( 32 name: str, 33 df: pd.DataFrame, 34 dataset_type: DataSetType, 35 dataset_subtype: DataSetSubType = DataSetSubType.DENSE, 36 dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY, 37 infer_dataset_report_period: bool = False, 38 infer_from_column_names: bool = False, 39) -> DataSetConfig: 40 # Sanity checks: 41 # Time index 42 if type(df.index) != pd.core.indexes.datetimes.DatetimeIndex: 43 raise BoostedDataSetSchemaException("Index must be DatetimeIndex.") 44 if len(df.columns) == 0: 45 raise BoostedDataSetSchemaException("No feature columns exist.") 46 47 datasetConfig = DataSetConfig(name, dataset_type, dataset_subtype, dataset_frequency) 48 # More than two columns, one goal, one feature 49 non_stock_identifier_seen = False 50 51 def variable(name: str) -> ColumnConfig: 52 return ColumnConfig(name=name, role=ColumnRole.VARIABLE, value_type=ColumnValueType.NUMBER) 53 54 for i, c in enumerate(zip(df.columns.values, df.dtypes.values)): 55 # process stock identifiers first, ensuring that they all lie grouped 56 # at the front of the column list 57 if ( 58 infer_from_column_names 59 and dataset_type == DataSetType.STOCK 60 and not non_stock_identifier_seen 61 ): 62 # make a good effort to match column names with identifiers by stripping out 63 # punctuation and lowercasing. 64 sub_role_match = ColumnSubRole.get_match(c[0]) 65 if sub_role_match: 66 f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=sub_role_match) 67 datasetConfig.addColumn(f) 68 continue 69 # end stock identifiers processing sequence as soon as we see something that's not 70 # an identifier 71 non_stock_identifier_seen = True 72 73 if dataset_type == DataSetType.STRATEGY and i < 1: 74 # Don't need to add the first column to the schema. 75 # It is assumed to be the security identifier. 
76 strategy_column_name = c[0] 77 elif ( 78 not infer_from_column_names 79 and dataset_type == DataSetType.STOCK 80 and ( 81 (dataset_subtype == DataSetSubType.DENSE and i < 3) 82 or ( 83 dataset_subtype in [DataSetSubType.SPARSE_HIST, DataSetSubType.SPARSE_FWD] 84 and i < 5 85 ) 86 ) 87 ): 88 if i == 0: 89 f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.ISIN) 90 elif i == 1: 91 f = ColumnConfig( 92 name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.COUNTRY 93 ) 94 elif i == 2: 95 f = ColumnConfig( 96 name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.CURRENCY 97 ) 98 elif i == 3: 99 f = ColumnConfig( 100 name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_DATE 101 ) 102 elif i == 4: 103 if infer_dataset_report_period: 104 f = variable(c[0]) 105 else: 106 f = ColumnConfig( 107 name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_PERIOD 108 ) 109 datasetConfig.addColumn(f) 110 elif pd.api.types.is_numeric_dtype(c[1]): 111 # Is numeric 112 f = variable(c[0]) 113 datasetConfig.addColumn(f) 114 else: 115 logger.info( 116 "Only numeric types are supported now." 117 ' IGNORING THIS COLUMN. Name = "{0}", type = {1}.'.format(c[0], c[1]) 118 ) 119 if dataset_type == DataSetType.STRATEGY: 120 strategies = list(OrderedDict.fromkeys(df[strategy_column_name].tolist())) 121 for strategy in strategies: 122 datasetConfig.addStrategy(StrategyConfig(name=strategy, source_name=strategy)) 123 return datasetConfig
def
validate_start_and_end_dates( start_date: Union[datetime.date, str, NoneType], end_date: Union[datetime.date, str, NoneType]) -> None:
126def validate_start_and_end_dates( 127 start_date: Optional[BoostedDate], end_date: Optional[BoostedDate] 128) -> None: 129 """Validate if start_date is earlier than end_date 130 131 Args: 132 start_date (BoostedDate): a date object or a valid ISO date string 133 end_date (BoostedDate): a date object or a valid ISO date string 134 135 Raises: 136 ValueError: the arguments are not date object or a valid ISO date string 137 """ 138 if isinstance(start_date, str): 139 try: 140 start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") 141 except ValueError: 142 raise ValueError("Start date must be a valid YYYY-MM-DD string.") 143 if isinstance(end_date, str): 144 try: 145 end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d") 146 except ValueError: 147 raise ValueError("End date must be a valid YYYY-MM-DD string.") 148 if start_date and end_date and start_date > end_date: 149 raise ValueError("Start date cannot be after end date!")
Validate if start_date is earlier than end_date
Args: start_date (BoostedDate): a date object or a valid ISO date string end_date (BoostedDate): a date object or a valid ISO date string
Raises: ValueError: the arguments are not date object or a valid ISO date string
def
get_valid_iso_dates( start_date: Union[datetime.date, str, NoneType] = None, end_date: Union[datetime.date, str, NoneType] = None) -> Tuple[str, str]:
152def get_valid_iso_dates( 153 start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None 154) -> Tuple[str, str]: 155 if (start_date and not end_date) or (end_date and not start_date): 156 raise BoostedAPIException("Must provide both start and end dates or neither") 157 elif not end_date and not start_date: 158 end_date = datetime.date.today() 159 start_date = end_date - datetime.timedelta(days=30) # default to 30 days 160 end_date_iso = end_date.isoformat() 161 start_date_iso = start_date.isoformat() 162 else: 163 if isinstance(start_date, datetime.date): 164 start_date_iso = start_date.isoformat() 165 else: 166 start_date_iso = start_date # type: ignore 167 if isinstance(end_date, datetime.date): 168 end_date_iso = end_date.isoformat() 169 else: 170 end_date_iso = end_date # type: ignore 171 172 return start_date_iso, end_date_iso
def
estimateUploadBatchPeriod(df: pandas.core.frame.DataFrame) -> Union[str, NoneType]:
176def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]: 177 maxrows = 30000 178 periods = ["Y", "M"] 179 for p in periods: 180 sizes = [] 181 # mypy can't tell df.index is DatetimeIndex 182 for t in df.index.to_period(p).unique(): # type: ignore[attr-defined] 183 print(t) 184 sizes.append(len(df.loc[str(t)])) 185 if max(sizes) < maxrows: 186 return p 187 # If we got here, no period worked. 188 return None
def
protoCubeJsonDataToDataFrame( pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list) -> pandas.core.frame.DataFrame:
191def protoCubeJsonDataToDataFrame( 192 pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list 193) -> pd.DataFrame: 194 pc_list = [] 195 for row_idx, row in enumerate(rows): 196 for col_idx, col in enumerate(columns): 197 pc_list.append([row, col] + pf[row_idx]["columns"][col_idx]["fields"]) 198 df = pd.DataFrame(pc_list) 199 df = df.set_axis([row_name, column_name] + fields, axis="columns") 200 df.set_index([row_name, column_name], inplace=True) 201 return df
def
getMaxClientVersion() -> Dict[str, Any]:
204def getMaxClientVersion() -> Dict[str, Any]: 205 import re 206 207 import requests 208 209 base_url = "https://boosted-downloads.s3.amazonaws.com/" 210 url = f"{base_url}?list-type=2" 211 OK = 200 212 resp = requests.get(url) 213 214 if resp.status_code == OK: 215 groups = [ 216 x 217 for x in re.finditer( 218 r"<Key>(boosted_insights_python_client-([0-9\.]+)-py3-[^<]+)", resp.text 219 ) 220 ] 221 222 def getVersion(x: Dict[str, Any]) -> Tuple: 223 return x["version"] 224 225 max_info = max( 226 [ 227 { 228 "version": tuple(map(int, group.group(2).split("."))), 229 "weblink": f"{base_url}{group.group(1)}", 230 } 231 for group in groups 232 ], 233 key=getVersion, 234 ) 235 max_info["version"] = str(".".join(map(str, max_info["version"]))) 236 return max_info 237 else: 238 raise ValueError(f"Bad Response Code: {resp.status_code}")