boosted.api.api_util
# Copyright (C) 2020 Gradient Boosted Investments, Inc. - All Rights Reserved

import datetime
import logging
from collections import OrderedDict

import pandas as pd
from dateutil import parser

from typing import Any, Dict, Optional, Tuple, Union

from boosted.api.api_type import (
    CustomNamespaceVariableRole,
    BoostedAPIException,
    BoostedDataSetSchemaException,
    BoostedDate,
    ColumnConfig,
    ColumnRole,
    ColumnSubRole,
    ColumnValueType,
    DataSetConfig,
    DataSetFrequency,
    DataSetSubType,
    DataSetType,
    StrategyConfig,
)


logger = logging.getLogger("boosted.api.api_util")

DEFAULT_WEEKS = 5


def to_camel_case(key: Optional[str]) -> Optional[str]:
    # Note: despite the name, this capitalizes the first character and
    # lowercases the rest; it does not produce true camelCase.
    if not key:
        return key
    return f"{key[0].upper()}{key[1:].lower()}"


def infer_dataset_schema(
    name: str,
    df: pd.DataFrame,
    dataset_type: DataSetType,
    dataset_subtype: DataSetSubType = DataSetSubType.DENSE,
    dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY,
    infer_dataset_report_period: bool = False,
    infer_from_column_names: bool = False,
) -> DataSetConfig:
    # Sanity checks: the frame must be indexed by time and must contain at
    # least one feature column.
    if not isinstance(df.index, pd.DatetimeIndex):
        raise BoostedDataSetSchemaException("Index must be DatetimeIndex.")
    if len(df.columns) == 0:
        raise BoostedDataSetSchemaException("No feature columns exist.")

    datasetConfig = DataSetConfig(name, dataset_type, dataset_subtype, dataset_frequency)
    non_stock_identifier_seen = False

    def variable(name: str) -> ColumnConfig:
        return ColumnConfig(name=name, role=ColumnRole.VARIABLE, value_type=ColumnValueType.NUMBER)

    for i, c in enumerate(zip(df.columns.values, df.dtypes.values)):
        # Process stock identifiers first, ensuring that they all lie grouped
        # at the front of the column list.
        if (
            infer_from_column_names
            and dataset_type in [DataSetType.STOCK, DataSetType.SECURITIES_DAILY]
            and not non_stock_identifier_seen
        ):
            # Make a best effort to match column names with identifiers by
            # stripping out punctuation and lowercasing.
            sub_role_match = ColumnSubRole.get_match(c[0])
            if sub_role_match:
                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=sub_role_match)
                datasetConfig.addColumn(f)
                continue

            custom_namespace_variable_role_match = CustomNamespaceVariableRole.get_match(c[0])
            if custom_namespace_variable_role_match:
                f = ColumnConfig(
                    name=c[0],
                    role=ColumnRole.VARIABLE,
                    custom_namespace_variable_role=custom_namespace_variable_role_match,
                )
                datasetConfig.addColumn(f)
                continue

            # End the identifier-processing sequence as soon as we see
            # something that is not an identifier.
            non_stock_identifier_seen = True

        if dataset_type == DataSetType.STRATEGY and i < 1:
            # The first column is not added to the schema; it is assumed to
            # hold the strategy identifier.
            strategy_column_name = c[0]
        elif (
            not infer_from_column_names
            and dataset_type == DataSetType.STOCK
            and (
                (dataset_subtype == DataSetSubType.DENSE and i < 3)
                or (
                    dataset_subtype in [DataSetSubType.SPARSE_HIST, DataSetSubType.SPARSE_FWD]
                    and i < 5
                )
            )
        ):
            if i == 0:
                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.ISIN)
            elif i == 1:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.COUNTRY
                )
            elif i == 2:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.CURRENCY
                )
            elif i == 3:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_DATE
                )
            elif i == 4:
                if infer_dataset_report_period:
                    f = variable(c[0])
                else:
                    f = ColumnConfig(
                        name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_PERIOD
                    )
            datasetConfig.addColumn(f)
        elif dataset_type == DataSetType.SECURITIES_DAILY:
            raise NotImplementedError(
                "Cannot infer column order for a custom securities daily dataset "
                "without exact column name matches as hints! Please see DataSetConfig "
                f"to create your own dataset schema. (column = {c[0]})"
            )
        elif pd.api.types.is_numeric_dtype(c[1]):
            f = variable(c[0])
            datasetConfig.addColumn(f)
        else:
            logger.info(
                "Only numeric types are supported now."
                ' IGNORING THIS COLUMN. Name = "{0}", type = {1}.'.format(c[0], c[1])
            )
    if dataset_type == DataSetType.STRATEGY:
        strategies = list(OrderedDict.fromkeys(df[strategy_column_name].tolist()))
        for strategy in strategies:
            datasetConfig.addStrategy(StrategyConfig(name=strategy, source_name=strategy))
    return datasetConfig


def validate_start_and_end_dates(
    start_date: Optional[BoostedDate], end_date: Optional[BoostedDate]
) -> None:
    """Validate that start_date is not later than end_date.

    Args:
        start_date (BoostedDate): a date object or a valid YYYY-MM-DD date string
        end_date (BoostedDate): a date object or a valid YYYY-MM-DD date string

    Raises:
        ValueError: an argument is neither a date object nor a valid
            YYYY-MM-DD date string, or start_date falls after end_date
    """
    if isinstance(start_date, str):
        try:
            # Normalize to a date so it compares cleanly with date arguments.
            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError("Start date must be a valid YYYY-MM-DD string.")
    if isinstance(end_date, str):
        try:
            end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError("End date must be a valid YYYY-MM-DD string.")
    if start_date and end_date and start_date > end_date:
        raise ValueError("Start date cannot be after end date!")


def convert_date(date: BoostedDate) -> datetime.date:
    if isinstance(date, str):
        try:
            return parser.parse(date).date()
        except Exception as e:
            raise BoostedAPIException(f"Unable to parse date: {str(e)}")
    return date


def get_date_range(
    start_date: Optional[Union[datetime.date, str]], end_date: Optional[Union[datetime.date, str]]
) -> Tuple[datetime.date, datetime.date]:
    today = datetime.date.today()
    if not start_date:
        if not end_date:
            start_date = today - datetime.timedelta(weeks=DEFAULT_WEEKS)
        else:
            start_date = convert_date(end_date) - datetime.timedelta(weeks=DEFAULT_WEEKS)
    else:
        start_date = convert_date(start_date)

    if not end_date:
        end_date = min(today, start_date + datetime.timedelta(weeks=DEFAULT_WEEKS))
    else:
        end_date = convert_date(end_date)
    return start_date, end_date


def get_valid_iso_dates(
    start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None
) -> Tuple[str, str]:
    if (start_date and not end_date) or (end_date and not start_date):
        raise BoostedAPIException("Must provide both start and end dates or neither")
    elif not end_date and not start_date:
        end_date = datetime.date.today()
        start_date = end_date - datetime.timedelta(days=30)  # default to 30 days
        end_date_iso = end_date.isoformat()
        start_date_iso = start_date.isoformat()
    else:
        if isinstance(start_date, datetime.date):
            start_date_iso = start_date.isoformat()
        else:
            start_date_iso = start_date  # type: ignore
        if isinstance(end_date, datetime.date):
            end_date_iso = end_date.isoformat()
        else:
            end_date_iso = end_date  # type: ignore

    return start_date_iso, end_date_iso


# TODO: this function has no callers. Consider removing it.
def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]:
    # Find the coarsest calendar period ("Y", then "M") for which no single
    # period contains maxrows rows or more.
    maxrows = 30000
    periods = ["Y", "M"]
    for p in periods:
        sizes = []
        # mypy can't tell df.index is DatetimeIndex
        for t in df.index.to_period(p).unique():  # type: ignore[attr-defined]
            sizes.append(len(df.loc[str(t)]))
        if max(sizes) < maxrows:
            return p
    # If we got here, no period worked.
    return None


def protoCubeJsonDataToDataFrame(
    pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list
) -> pd.DataFrame:
    # Flatten the proto cube into one record per (row, column) pair, then
    # index the frame by those two levels.
    pc_list = []
    for row_idx, row in enumerate(rows):
        for col_idx, col in enumerate(columns):
            pc_list.append([row, col] + pf[row_idx]["columns"][col_idx]["fields"])
    df = pd.DataFrame(pc_list)
    df = df.set_axis([row_name, column_name] + fields, axis="columns")
    df.set_index([row_name, column_name], inplace=True)
    return df


def getMaxClientVersion() -> Dict[str, Any]:
    # Imported lazily so the module does not require requests at import time.
    import re

    import requests

    base_url = "https://boosted-downloads.s3.amazonaws.com/"
    url = f"{base_url}?list-type=2"
    OK = 200
    resp = requests.get(url)

    if resp.status_code == OK:
        groups = list(
            re.finditer(
                r"<Key>(boosted_insights_python_client-([0-9\.]+)-py3-[^<]+)", resp.text
            )
        )

        def getVersion(x: Dict[str, Any]) -> Tuple:
            return x["version"]

        max_info = max(
            [
                {
                    "version": tuple(map(int, group.group(2).split("."))),
                    "weblink": f"{base_url}{group.group(1)}",
                }
                for group in groups
            ],
            key=getVersion,
        )
        max_info["version"] = ".".join(map(str, max_info["version"]))
        return max_info
    else:
        raise ValueError(f"Bad Response Code: {resp.status_code}")
logger = <Logger boosted.api.api_util (WARNING)>
DEFAULT_WEEKS = 5
def to_camel_case(key: Optional[str]) -> Optional[str]:
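A quick behavioral check; note that despite its name the helper capitalizes only the first character and lowercases the rest, rather than producing true camelCase:

from boosted.api.api_util import to_camel_case

assert to_camel_case("DAILY") == "Daily"
assert to_camel_case("reportDate") == "Reportdate"  # everything after the first char is lowercased
assert to_camel_case(None) is None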
def infer_dataset_schema(
    name: str,
    df: pd.DataFrame,
    dataset_type: DataSetType,
    dataset_subtype: DataSetSubType = DataSetSubType.DENSE,
    dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY,
    infer_dataset_report_period: bool = False,
    infer_from_column_names: bool = False,
) -> DataSetConfig:
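A minimal usage sketch for a STOCK dataset with infer_from_column_names enabled. The column names below are illustrative; whether each one matches an identifier sub-role depends on ColumnSubRole.get_match:

import pandas as pd
from boosted.api.api_type import DataSetType
from boosted.api.api_util import infer_dataset_schema

df = pd.DataFrame(
    {
        "isin": ["US0378331005"],  # assumed to match ColumnSubRole.ISIN
        "country": ["US"],         # assumed to match ColumnSubRole.COUNTRY
        "currency": ["USD"],       # assumed to match ColumnSubRole.CURRENCY
        "my_signal": [1.23],       # numeric, so picked up as a VARIABLE
    },
    index=pd.DatetimeIndex(["2020-01-02"]),
)
config = infer_dataset_schema(
    "my_dataset", df, DataSetType.STOCK, infer_from_column_names=True
)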
def validate_start_and_end_dates(start_date: Optional[BoostedDate], end_date: Optional[BoostedDate]) -> None:
Validate that start_date is not later than end_date.

Args:
    start_date (BoostedDate): a date object or a valid YYYY-MM-DD date string
    end_date (BoostedDate): a date object or a valid YYYY-MM-DD date string

Raises:
    ValueError: an argument is neither a date object nor a valid YYYY-MM-DD date string, or start_date falls after end_date
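A short usage sketch of the accepted inputs:

import datetime
from boosted.api.api_util import validate_start_and_end_dates

validate_start_and_end_dates("2021-01-01", "2021-02-01")       # ok
validate_start_and_end_dates(datetime.date(2021, 1, 1), None)  # ok: a missing side is skipped
try:
    validate_start_and_end_dates("2021-02-01", "2021-01-01")
except ValueError as e:
    print(e)  # Start date cannot be after end date!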
def convert_date(date: BoostedDate) -> datetime.date:
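For example (dateutil accepts ISO strings among other formats):

import datetime
from boosted.api.api_util import convert_date

assert convert_date("2021-03-15") == datetime.date(2021, 3, 15)
assert convert_date(datetime.date(2021, 3, 15)) == datetime.date(2021, 3, 15)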
def get_date_range(start_date: Optional[Union[datetime.date, str]], end_date: Optional[Union[datetime.date, str]]) -> Tuple[datetime.date, datetime.date]:
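The defaulting behavior, illustrated; missing endpoints are filled in with a DEFAULT_WEEKS (five-week) window:

import datetime
from boosted.api.api_util import get_date_range

# Only an end date: the start is anchored five weeks earlier.
start, end = get_date_range(None, "2021-06-30")
assert (start, end) == (datetime.date(2021, 5, 26), datetime.date(2021, 6, 30))

# Neither date: a five-week window ending today.
start, end = get_date_range(None, None)
assert end - start == datetime.timedelta(weeks=5)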
def get_valid_iso_dates(start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None) -> Tuple[str, str]:
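For example:

import datetime
from boosted.api.api_util import get_valid_iso_dates

# Date objects and ISO strings both come back as ISO strings.
assert get_valid_iso_dates(datetime.date(2021, 1, 1), "2021-02-01") == ("2021-01-01", "2021-02-01")

# With neither argument, the range defaults to the trailing 30 days ending today.
start_iso, end_iso = get_valid_iso_dates()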
def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]:
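A sketch of the heuristic on synthetic data; two years of daily rows stay well under the 30,000-row batch limit, so yearly batching suffices:

import numpy as np
import pandas as pd
from boosted.api.api_util import estimateUploadBatchPeriod

idx = pd.date_range("2020-01-01", "2021-12-31", freq="D")
df = pd.DataFrame({"x": np.arange(len(idx))}, index=idx)
assert estimateUploadBatchPeriod(df) == "Y"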
def protoCubeJsonDataToDataFrame(pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list) -> pd.DataFrame:
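A minimal sketch of the expected proto-cube shape; the payload below is illustrative, not a real API response:

from boosted.api.api_util import protoCubeJsonDataToDataFrame

pf = [  # one entry per row; each column entry carries a "fields" value list
    {"columns": [{"fields": [1.0, 2.0]}, {"fields": [3.0, 4.0]}]},
    {"columns": [{"fields": [5.0, 6.0]}, {"fields": [7.0, 8.0]}]},
]
df = protoCubeJsonDataToDataFrame(
    pf,
    row_name="date", rows=["2021-01-01", "2021-01-02"],
    column_name="security", columns=["AAPL", "MSFT"],
    fields=["weight", "return"],
)
# df is indexed by (date, security) with columns ["weight", "return"].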
def getMaxClientVersion() -> Dict[str, Any]:
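Usage is a single call; it needs network access to the public download bucket and raises ValueError on a non-200 response:

from boosted.api.api_util import getMaxClientVersion

info = getMaxClientVersion()
print(info["version"], info["weblink"])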