boosted.api.api_util
# Copyright (C) 2020 Gradient Boosted Investments, Inc. - All Rights Reserved

import datetime
import logging
from collections import OrderedDict
from typing import Any, Dict, Optional, Tuple

import pandas as pd

from boosted.api.api_type import (
    BoostedAPIException,
    BoostedDataSetSchemaException,
    BoostedDate,
    ColumnConfig,
    ColumnRole,
    ColumnSubRole,
    ColumnValueType,
    CustomNamespaceVariableRole,
    DataSetConfig,
    DataSetFrequency,
    DataSetSubType,
    DataSetType,
    StrategyConfig,
)

logger = logging.getLogger("boosted.api.api_util")


def to_camel_case(key: Optional[str]) -> Optional[str]:
    """Uppercase the first character of *key* and lowercase the rest.

    Returns the input unchanged when it is None or the empty string.
    """
    if not key:
        return key
    return f"{key[0].upper()}{key[1:].lower()}"


def infer_dataset_schema(
    name: str,
    df: pd.DataFrame,
    dataset_type: DataSetType,
    dataset_subtype: DataSetSubType = DataSetSubType.DENSE,
    dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY,
    infer_dataset_report_period: bool = False,
    infer_from_column_names: bool = False,
) -> DataSetConfig:
    """Infer a :class:`DataSetConfig` from a DataFrame's columns and dtypes.

    Args:
        name: name for the new dataset.
        df: source data; the index must be a DatetimeIndex.
        dataset_type: kind of dataset (STOCK, STRATEGY, SECURITIES_DAILY, ...).
        dataset_subtype: DENSE / SPARSE_HIST / SPARSE_FWD; controls how many
            leading columns are treated as positional identifiers for STOCK
            datasets (3 for dense, 5 for sparse).
        dataset_frequency: sampling frequency recorded on the config.
        infer_dataset_report_period: when True, the fifth sparse-STOCK column
            is treated as a plain variable instead of a REPORT_PERIOD
            identifier.
        infer_from_column_names: when True, leading identifier columns are
            matched by column name instead of by position.

    Returns:
        The populated DataSetConfig. Non-numeric, non-identifier columns are
        logged and ignored.

    Raises:
        BoostedDataSetSchemaException: if the index is not a DatetimeIndex or
            there are no feature columns.
        NotImplementedError: for a SECURITIES_DAILY column that cannot be
            matched by name.
    """
    # Sanity checks: a time index and at least one feature column.
    if not isinstance(df.index, pd.DatetimeIndex):
        raise BoostedDataSetSchemaException("Index must be DatetimeIndex.")
    if len(df.columns) == 0:
        raise BoostedDataSetSchemaException("No feature columns exist.")

    datasetConfig = DataSetConfig(name, dataset_type, dataset_subtype, dataset_frequency)
    non_stock_identifier_seen = False

    def variable(name: str) -> ColumnConfig:
        # Shorthand for a plain numeric variable column.
        return ColumnConfig(name=name, role=ColumnRole.VARIABLE, value_type=ColumnValueType.NUMBER)

    for i, c in enumerate(zip(df.columns.values, df.dtypes.values)):
        # Process stock identifiers first, ensuring that they all lie grouped
        # at the front of the column list.
        if (
            infer_from_column_names
            and dataset_type in [DataSetType.STOCK, DataSetType.SECURITIES_DAILY]
            and not non_stock_identifier_seen
        ):
            # Make a good effort to match column names with identifiers by
            # stripping out punctuation and lowercasing.
            sub_role_match = ColumnSubRole.get_match(c[0])
            if sub_role_match:
                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=sub_role_match)
                datasetConfig.addColumn(f)
                continue

            custom_namespace_variable_role_match = CustomNamespaceVariableRole.get_match(c[0])
            if custom_namespace_variable_role_match:
                f = ColumnConfig(
                    name=c[0],
                    role=ColumnRole.VARIABLE,
                    custom_namespace_variable_role=custom_namespace_variable_role_match,
                )
                datasetConfig.addColumn(f)
                continue

            # End the stock-identifier processing sequence as soon as we see
            # something that's not an identifier.
            non_stock_identifier_seen = True

        if dataset_type == DataSetType.STRATEGY and i < 1:
            # Don't need to add the first column to the schema.
            # It is assumed to be the security identifier.
            strategy_column_name = c[0]
        elif (
            not infer_from_column_names
            and dataset_type == DataSetType.STOCK
            and (
                (dataset_subtype == DataSetSubType.DENSE and i < 3)
                or (
                    dataset_subtype in [DataSetSubType.SPARSE_HIST, DataSetSubType.SPARSE_FWD]
                    and i < 5
                )
            )
        ):
            # Positional identifiers: ISIN, country, currency, then (sparse
            # subtypes only) report date and report period.
            if i == 0:
                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.ISIN)
            elif i == 1:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.COUNTRY
                )
            elif i == 2:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.CURRENCY
                )
            elif i == 3:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_DATE
                )
            elif i == 4:
                if infer_dataset_report_period:
                    f = variable(c[0])
                else:
                    f = ColumnConfig(
                        name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_PERIOD
                    )
            datasetConfig.addColumn(f)
        elif dataset_type == DataSetType.SECURITIES_DAILY:
            raise NotImplementedError(
                "Can not infer order for custom security daily dataset "
                + "without exact column name match as hints! Please see DataSetConfig to create "
                + f"your own dataset schema. (column = {c[0]})"
            )
        elif pd.api.types.is_numeric_dtype(c[1]):
            # Numeric columns become plain variables.
            f = variable(c[0])
            datasetConfig.addColumn(f)
        else:
            logger.info(
                "Only numeric types are supported now."
                ' IGNORING THIS COLUMN. Name = "{0}", type = {1}.'.format(c[0], c[1])
            )
    if dataset_type == DataSetType.STRATEGY:
        # Each distinct value of the first (identifier) column becomes a
        # strategy; OrderedDict.fromkeys de-dupes while keeping first-seen order.
        strategies = list(OrderedDict.fromkeys(df[strategy_column_name].tolist()))
        for strategy in strategies:
            datasetConfig.addStrategy(StrategyConfig(name=strategy, source_name=strategy))
    return datasetConfig


def validate_start_and_end_dates(
    start_date: Optional[BoostedDate], end_date: Optional[BoostedDate]
) -> None:
    """Validate if start_date is earlier than end_date

    Args:
        start_date (BoostedDate): a date object or a valid ISO date string
        end_date (BoostedDate): a date object or a valid ISO date string

    Raises:
        ValueError: the arguments are not date objects or valid ISO date
            strings, or start_date is after end_date
    """
    if isinstance(start_date, str):
        try:
            # Normalize to datetime.date so a string argument can be compared
            # against a date argument below (datetime vs. date comparisons
            # raise TypeError in Python 3).
            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError("Start date must be a valid YYYY-MM-DD string.")
    if isinstance(end_date, str):
        try:
            end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError("End date must be a valid YYYY-MM-DD string.")
    if start_date and end_date and start_date > end_date:
        raise ValueError("Start date cannot be after end date!")


def get_valid_iso_dates(
    start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None
) -> Tuple[str, str]:
    """Return ``(start, end)`` as ISO-8601 date strings.

    The two dates must be supplied together; when neither is given, the range
    defaults to the 30-day window ending today. String inputs are passed
    through unchanged (assumed to already be valid ISO dates — see
    ``validate_start_and_end_dates``).

    Raises:
        BoostedAPIException: if exactly one of the two dates is provided.
    """
    if (start_date and not end_date) or (end_date and not start_date):
        raise BoostedAPIException("Must provide both start and end dates or neither")
    elif not end_date and not start_date:
        end_date = datetime.date.today()
        start_date = end_date - datetime.timedelta(days=30)  # default to 30 days
        end_date_iso = end_date.isoformat()
        start_date_iso = start_date.isoformat()
    else:
        if isinstance(start_date, datetime.date):
            start_date_iso = start_date.isoformat()
        else:
            start_date_iso = start_date  # type: ignore
        if isinstance(end_date, datetime.date):
            end_date_iso = end_date.isoformat()
        else:
            end_date_iso = end_date  # type: ignore

    return start_date_iso, end_date_iso


# TODO this function is not called by anyone. Consider to remove it
def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]:
    """Pick the coarsest calendar period whose largest batch fits the limit.

    Tries yearly ("Y") then monthly ("M") grouping of *df* by its time index
    and returns the first period for which every batch stays under the
    per-upload row limit, or None when even monthly batches are too large.
    """
    maxrows = 30000
    for period in ("Y", "M"):
        # mypy can't tell df.index is a DatetimeIndex
        sizes = [
            len(df.loc[str(t)])
            for t in df.index.to_period(period).unique()  # type: ignore[attr-defined]
        ]
        if max(sizes) < maxrows:
            return period
    # No period keeps every batch under the row limit.
    return None


def protoCubeJsonDataToDataFrame(
    pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list
) -> pd.DataFrame:
    """Flatten proto-cube JSON (rows x columns x fields) into a DataFrame.

    Args:
        pf: nested payload; ``pf[row]["columns"][col]["fields"]`` holds the
            list of field values for one (row, column) cell.
        row_name / column_name: names for the two index levels.
        rows / columns: labels for the outer and inner axes, ordered to match
            *pf*.
        fields: names for the per-cell value columns.

    Returns:
        DataFrame indexed by (row, column) with one column per field.
    """
    records = []
    for row_idx, row in enumerate(rows):
        for col_idx, col in enumerate(columns):
            records.append([row, col] + pf[row_idx]["columns"][col_idx]["fields"])
    df = pd.DataFrame(records)
    df = df.set_axis([row_name, column_name] + fields, axis="columns")
    df.set_index([row_name, column_name], inplace=True)
    return df


def getMaxClientVersion() -> Dict[str, Any]:
    """Query the public S3 download bucket for the newest client wheel.

    Returns:
        dict with "version" (dotted version string) and "weblink" (full
        download URL of the wheel).

    Raises:
        ValueError: if the bucket listing does not return HTTP 200, or no
            client wheel keys are present in the listing.
    """
    import re

    import requests

    base_url = "https://boosted-downloads.s3.amazonaws.com/"
    url = f"{base_url}?list-type=2"
    OK = 200
    resp = requests.get(url)

    if resp.status_code != OK:
        raise ValueError(f"Bad Response Code: {resp.status_code}")

    groups = list(
        re.finditer(r"<Key>(boosted_insights_python_client-([0-9\.]+)-py3-[^<]+)", resp.text)
    )

    def getVersion(x: Dict[str, Any]) -> Tuple:
        # Sort key: the numeric version tuple, so 1.10 > 1.9.
        return x["version"]

    max_info = max(
        [
            {
                "version": tuple(map(int, group.group(2).split("."))),
                "weblink": f"{base_url}{group.group(1)}",
            }
            for group in groups
        ],
        key=getVersion,
    )
    # Convert the comparison tuple back to a dotted version string.
    max_info["version"] = ".".join(map(str, max_info["version"]))
    return max_info
logger =
<Logger boosted.api.api_util (WARNING)>
def
to_camel_case(key: Union[str, NoneType]) -> Union[str, NoneType]:
def
infer_dataset_schema( name: str, df: pandas.core.frame.DataFrame, dataset_type: boosted.api.api_type.DataSetType, dataset_subtype: boosted.api.api_type.DataSetSubType = <DataSetSubType.DENSE: 1>, dataset_frequency: boosted.api.api_type.DataSetFrequency = <DataSetFrequency.DAILY: 1>, infer_dataset_report_period: bool = False, infer_from_column_names: bool = False) -> boosted.api.api_type.DataSetConfig:
42def infer_dataset_schema( 43 name: str, 44 df: pd.DataFrame, 45 dataset_type: DataSetType, 46 dataset_subtype: DataSetSubType = DataSetSubType.DENSE, 47 dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY, 48 infer_dataset_report_period: bool = False, 49 infer_from_column_names: bool = False, 50) -> DataSetConfig: 51 # Sanity checks: 52 # Time index 53 if type(df.index) != pd.core.indexes.datetimes.DatetimeIndex: 54 raise BoostedDataSetSchemaException("Index must be DatetimeIndex.") 55 if len(df.columns) == 0: 56 raise BoostedDataSetSchemaException("No feature columns exist.") 57 58 datasetConfig = DataSetConfig(name, dataset_type, dataset_subtype, dataset_frequency) 59 # More than two columns, one goal, one feature 60 non_stock_identifier_seen = False 61 62 def variable(name: str) -> ColumnConfig: 63 return ColumnConfig(name=name, role=ColumnRole.VARIABLE, value_type=ColumnValueType.NUMBER) 64 65 for i, c in enumerate(zip(df.columns.values, df.dtypes.values)): 66 # process stock identifiers first, ensuring that they all lie grouped 67 # at the front of the column list 68 if ( 69 infer_from_column_names 70 and dataset_type in [DataSetType.STOCK, DataSetType.SECURITIES_DAILY] 71 and not non_stock_identifier_seen 72 ): 73 # make a good effort to match column names with identifiers by stripping out 74 # punctuation and lowercasing. 
75 sub_role_match = ColumnSubRole.get_match(c[0]) 76 if sub_role_match: 77 f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=sub_role_match) 78 datasetConfig.addColumn(f) 79 continue 80 81 custon_namespace_variable_role_match = CustomNamespaceVariableRole.get_match(c[0]) 82 if custon_namespace_variable_role_match: 83 f = ColumnConfig( 84 name=c[0], 85 role=ColumnRole.VARIABLE, 86 custom_namespace_variable_role=custon_namespace_variable_role_match, 87 ) 88 datasetConfig.addColumn(f) 89 continue 90 91 # end stock identifiers processing sequence as soon as we see something that's not 92 # an identifier 93 non_stock_identifier_seen = True 94 95 if dataset_type == DataSetType.STRATEGY and i < 1: 96 # Don't need to add the first column to the schema. 97 # It is assumed to be the security identifier. 98 strategy_column_name = c[0] 99 elif ( 100 not infer_from_column_names 101 and dataset_type == DataSetType.STOCK 102 and ( 103 (dataset_subtype == DataSetSubType.DENSE and i < 3) 104 or ( 105 dataset_subtype in [DataSetSubType.SPARSE_HIST, DataSetSubType.SPARSE_FWD] 106 and i < 5 107 ) 108 ) 109 ): 110 if i == 0: 111 f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.ISIN) 112 elif i == 1: 113 f = ColumnConfig( 114 name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.COUNTRY 115 ) 116 elif i == 2: 117 f = ColumnConfig( 118 name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.CURRENCY 119 ) 120 elif i == 3: 121 f = ColumnConfig( 122 name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_DATE 123 ) 124 elif i == 4: 125 if infer_dataset_report_period: 126 f = variable(c[0]) 127 else: 128 f = ColumnConfig( 129 name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_PERIOD 130 ) 131 datasetConfig.addColumn(f) 132 elif dataset_type == DataSetType.SECURITIES_DAILY: 133 raise NotImplementedError( 134 f"Can not infer order for custom security daily dataset " 135 + "without exact column name match 
as hints! Please see DataSetConfig to create " 136 + f"your own dataset schema. (column = {c[0]})" 137 ) 138 elif pd.api.types.is_numeric_dtype(c[1]): 139 # Is numeric 140 f = variable(c[0]) 141 datasetConfig.addColumn(f) 142 else: 143 logger.info( 144 "Only numeric types are supported now." 145 ' IGNORING THIS COLUMN. Name = "{0}", type = {1}.'.format(c[0], c[1]) 146 ) 147 if dataset_type == DataSetType.STRATEGY: 148 strategies = list(OrderedDict.fromkeys(df[strategy_column_name].tolist())) 149 for strategy in strategies: 150 datasetConfig.addStrategy(StrategyConfig(name=strategy, source_name=strategy)) 151 return datasetConfig
def
validate_start_and_end_dates( start_date: Union[datetime.date, str, NoneType], end_date: Union[datetime.date, str, NoneType]) -> None:
154def validate_start_and_end_dates( 155 start_date: Optional[BoostedDate], end_date: Optional[BoostedDate] 156) -> None: 157 """Validate if start_date is earlier than end_date 158 159 Args: 160 start_date (BoostedDate): a date object or a valid ISO date string 161 end_date (BoostedDate): a date object or a valid ISO date string 162 163 Raises: 164 ValueError: the arguments are not date object or a valid ISO date string 165 """ 166 if isinstance(start_date, str): 167 try: 168 start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") 169 except ValueError: 170 raise ValueError("Start date must be a valid YYYY-MM-DD string.") 171 if isinstance(end_date, str): 172 try: 173 end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d") 174 except ValueError: 175 raise ValueError("End date must be a valid YYYY-MM-DD string.") 176 if start_date and end_date and start_date > end_date: 177 raise ValueError("Start date cannot be after end date!")
Validate if start_date is earlier than end_date
Args: start_date (BoostedDate): a date object or a valid ISO date string end_date (BoostedDate): a date object or a valid ISO date string
Raises: ValueError: the arguments are not date object or a valid ISO date string
def
get_valid_iso_dates( start_date: Union[datetime.date, str, NoneType] = None, end_date: Union[datetime.date, str, NoneType] = None) -> Tuple[str, str]:
180def get_valid_iso_dates( 181 start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None 182) -> Tuple[str, str]: 183 if (start_date and not end_date) or (end_date and not start_date): 184 raise BoostedAPIException("Must provide both start and end dates or neither") 185 elif not end_date and not start_date: 186 end_date = datetime.date.today() 187 start_date = end_date - datetime.timedelta(days=30) # default to 30 days 188 end_date_iso = end_date.isoformat() 189 start_date_iso = start_date.isoformat() 190 else: 191 if isinstance(start_date, datetime.date): 192 start_date_iso = start_date.isoformat() 193 else: 194 start_date_iso = start_date # type: ignore 195 if isinstance(end_date, datetime.date): 196 end_date_iso = end_date.isoformat() 197 else: 198 end_date_iso = end_date # type: ignore 199 200 return start_date_iso, end_date_iso
def
estimateUploadBatchPeriod(df: pandas.core.frame.DataFrame) -> Union[str, NoneType]:
204def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]: 205 maxrows = 30000 206 periods = ["Y", "M"] 207 for p in periods: 208 sizes = [] 209 # mypy can't tell df.index is DatetimeIndex 210 for t in df.index.to_period(p).unique(): # type: ignore[attr-defined] 211 print(t) 212 sizes.append(len(df.loc[str(t)])) 213 if max(sizes) < maxrows: 214 return p 215 # If we got here, no period worked. 216 return None
def
protoCubeJsonDataToDataFrame( pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list) -> pandas.core.frame.DataFrame:
219def protoCubeJsonDataToDataFrame( 220 pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list 221) -> pd.DataFrame: 222 pc_list = [] 223 for row_idx, row in enumerate(rows): 224 for col_idx, col in enumerate(columns): 225 pc_list.append([row, col] + pf[row_idx]["columns"][col_idx]["fields"]) 226 df = pd.DataFrame(pc_list) 227 df = df.set_axis([row_name, column_name] + fields, axis="columns") 228 df.set_index([row_name, column_name], inplace=True) 229 return df
def
getMaxClientVersion() -> Dict[str, Any]:
232def getMaxClientVersion() -> Dict[str, Any]: 233 import re 234 235 import requests 236 237 base_url = "https://boosted-downloads.s3.amazonaws.com/" 238 url = f"{base_url}?list-type=2" 239 OK = 200 240 resp = requests.get(url) 241 242 if resp.status_code == OK: 243 groups = [ 244 x 245 for x in re.finditer( 246 r"<Key>(boosted_insights_python_client-([0-9\.]+)-py3-[^<]+)", resp.text 247 ) 248 ] 249 250 def getVersion(x: Dict[str, Any]) -> Tuple: 251 return x["version"] 252 253 max_info = max( 254 [ 255 { 256 "version": tuple(map(int, group.group(2).split("."))), 257 "weblink": f"{base_url}{group.group(1)}", 258 } 259 for group in groups 260 ], 261 key=getVersion, 262 ) 263 max_info["version"] = str(".".join(map(str, max_info["version"]))) 264 return max_info 265 else: 266 raise ValueError(f"Bad Response Code: {resp.status_code}")