boosted.api.api_util

  1# Copyright (C) 2020 Gradient Boosted Investments, Inc. - All Rights Reserved
  2
  3import datetime
  4import logging
  5from collections import OrderedDict
  6
  7import pandas as pd
  8
  9# import numpy as np
 10
 11from typing import Any, Dict, Optional, Tuple
 12
 13from boosted.api.api_type import (
 14    CustomNamespaceVariableRole,
 15    BoostedAPIException,
 16    BoostedDataSetSchemaException,
 17    BoostedDate,
 18    ColumnConfig,
 19    ColumnRole,
 20    ColumnSubRole,
 21    ColumnValueType,
 22    DataSetConfig,
 23    DataSetFrequency,
 24    DataSetSubType,
 25    DataSetType,
 26    StrategyConfig,
 27)
 28
 29# import numpy as np
 30
 31
 32logger = logging.getLogger("boosted.api.api_util")
 33
 34
 35def to_camel_case(key: Optional[str]) -> Optional[str]:
 36    if not key:
 37        return key
 38    return f"{key[0].upper()}{key[1:].lower()}"
 39
 40
 41def infer_dataset_schema(
 42    name: str,
 43    df: pd.DataFrame,
 44    dataset_type: DataSetType,
 45    dataset_subtype: DataSetSubType = DataSetSubType.DENSE,
 46    dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY,
 47    infer_dataset_report_period: bool = False,
 48    infer_from_column_names: bool = False,
 49) -> DataSetConfig:
 50    # Sanity checks:
 51    # Time index
 52    if type(df.index) != pd.core.indexes.datetimes.DatetimeIndex:
 53        raise BoostedDataSetSchemaException("Index must be DatetimeIndex.")
 54    if len(df.columns) == 0:
 55        raise BoostedDataSetSchemaException("No feature columns exist.")
 56
 57    datasetConfig = DataSetConfig(name, dataset_type, dataset_subtype, dataset_frequency)
 58    # More than two columns, one goal, one feature
 59    non_stock_identifier_seen = False
 60
 61    def variable(name: str) -> ColumnConfig:
 62        return ColumnConfig(name=name, role=ColumnRole.VARIABLE, value_type=ColumnValueType.NUMBER)
 63
 64    for i, c in enumerate(zip(df.columns.values, df.dtypes.values)):
 65        # process stock identifiers first, ensuring that they all lie grouped
 66        # at the front of the column list
 67        if (
 68            infer_from_column_names
 69            and dataset_type in [DataSetType.STOCK, DataSetType.SECURITIES_DAILY]
 70            and not non_stock_identifier_seen
 71        ):
 72            # make a good effort to match column names with identifiers by stripping out
 73            # punctuation and lowercasing.
 74            sub_role_match = ColumnSubRole.get_match(c[0])
 75            if sub_role_match:
 76                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=sub_role_match)
 77                datasetConfig.addColumn(f)
 78                continue
 79
 80            custon_namespace_variable_role_match = CustomNamespaceVariableRole.get_match(c[0])
 81            if custon_namespace_variable_role_match:
 82                f = ColumnConfig(
 83                    name=c[0],
 84                    role=ColumnRole.VARIABLE,
 85                    custom_namespace_variable_role=custon_namespace_variable_role_match,
 86                )
 87                datasetConfig.addColumn(f)
 88                continue
 89
 90            # end stock identifiers processing sequence as soon as we see something that's not
 91            # an identifier
 92            non_stock_identifier_seen = True
 93
 94        if dataset_type == DataSetType.STRATEGY and i < 1:
 95            # Don't need to add the first column to the schema.
 96            # It is assumed to be the security identifier.
 97            strategy_column_name = c[0]
 98        elif (
 99            not infer_from_column_names
100            and dataset_type == DataSetType.STOCK
101            and (
102                (dataset_subtype == DataSetSubType.DENSE and i < 3)
103                or (
104                    dataset_subtype in [DataSetSubType.SPARSE_HIST, DataSetSubType.SPARSE_FWD]
105                    and i < 5
106                )
107            )
108        ):
109            if i == 0:
110                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.ISIN)
111            elif i == 1:
112                f = ColumnConfig(
113                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.COUNTRY
114                )
115            elif i == 2:
116                f = ColumnConfig(
117                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.CURRENCY
118                )
119            elif i == 3:
120                f = ColumnConfig(
121                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_DATE
122                )
123            elif i == 4:
124                if infer_dataset_report_period:
125                    f = variable(c[0])
126                else:
127                    f = ColumnConfig(
128                        name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_PERIOD
129                    )
130            datasetConfig.addColumn(f)
131        elif dataset_type == DataSetType.SECURITIES_DAILY:
132            raise NotImplementedError(
133                f"Can not infer order for custom security daily dataset "
134                + "without exact column name match as hints! Please see DataSetConfig to create "
135                + f"your own dataset schema. (column = {c[0]})"
136            )
137        elif pd.api.types.is_numeric_dtype(c[1]):
138            # Is numeric
139            f = variable(c[0])
140            datasetConfig.addColumn(f)
141        else:
142            logger.info(
143                "Only numeric types are supported now."
144                '  IGNORING THIS COLUMN. Name = "{0}", type = {1}.'.format(c[0], c[1])
145            )
146    if dataset_type == DataSetType.STRATEGY:
147        strategies = list(OrderedDict.fromkeys(df[strategy_column_name].tolist()))
148        for strategy in strategies:
149            datasetConfig.addStrategy(StrategyConfig(name=strategy, source_name=strategy))
150    return datasetConfig
151
152
def validate_start_and_end_dates(
    start_date: Optional[BoostedDate], end_date: Optional[BoostedDate]
) -> None:
    """Validate that start_date is not later than end_date.

    Args:
        start_date (BoostedDate): a date object or a valid ISO (YYYY-MM-DD)
            date string.
        end_date (BoostedDate): a date object or a valid ISO (YYYY-MM-DD)
            date string.

    Raises:
        ValueError: an argument is not a date object or a valid ISO date
            string, or start_date falls after end_date.
    """
    if isinstance(start_date, str):
        try:
            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        except ValueError:
            raise ValueError("Start date must be a valid YYYY-MM-DD string.")
    if isinstance(end_date, str):
        try:
            end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
        except ValueError:
            raise ValueError("End date must be a valid YYYY-MM-DD string.")
    # Normalize datetimes down to dates so the comparison below never mixes
    # datetime and date operands (Python raises TypeError on that mix, which
    # previously happened when one argument was a string and the other a date).
    if isinstance(start_date, datetime.datetime):
        start_date = start_date.date()
    if isinstance(end_date, datetime.datetime):
        end_date = end_date.date()
    if start_date and end_date and start_date > end_date:
        raise ValueError("Start date cannot be after end date!")
177
178
def get_valid_iso_dates(
    start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None
) -> Tuple[str, str]:
    """Return (start, end) as ISO date strings.

    Both dates must be supplied together or both omitted; when omitted, the
    range defaults to the 30 days ending today. Date objects are converted via
    isoformat(); strings are passed through unchanged.

    Raises:
        BoostedAPIException: exactly one of the two dates was provided.
    """
    has_start = bool(start_date)
    has_end = bool(end_date)
    if has_start != has_end:
        raise BoostedAPIException("Must provide both start and end dates or neither")

    if not has_start:
        # Neither given: default to the trailing 30-day window.
        today = datetime.date.today()
        return (today - datetime.timedelta(days=30)).isoformat(), today.isoformat()

    def _to_iso(value):
        # Date objects get ISO-formatted; anything else (a string) passes through.
        return value.isoformat() if isinstance(value, datetime.date) else value

    return _to_iso(start_date), _to_iso(end_date)
200
201
202# TODO this function is not called by anyone. Consider to remove it
def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]:
    """Pick the coarsest calendar period that keeps upload batches small.

    Tries yearly then monthly batching and returns the first period whose
    largest batch stays under 30,000 rows.

    Args:
        df: frame indexed by a DatetimeIndex.

    Returns:
        "Y" or "M" when batching by that period keeps every batch under the
        row limit (an empty frame trivially satisfies "Y"), or None when no
        candidate period is small enough.
    """
    maxrows = 30000
    for period in ("Y", "M"):
        # Count rows per period bucket in one pass instead of re-slicing the
        # frame for every unique period value.
        # mypy can't tell df.index is DatetimeIndex
        batch_sizes = df.index.to_period(period).value_counts()  # type: ignore[attr-defined]
        # An empty frame has no oversized batch; guard against max() on empty.
        if batch_sizes.empty or batch_sizes.max() < maxrows:
            return period
    # If we got here, no period worked.
    return None
216
217
def protoCubeJsonDataToDataFrame(
    pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list
) -> pd.DataFrame:
    """Flatten proto-cube JSON data into a DataFrame indexed by (row, column).

    Args:
        pf: proto-cube payload; ``pf[i]["columns"][j]["fields"]`` holds the
            field values for ``rows[i]`` x ``columns[j]``.
        row_name: name of the first index level.
        rows: labels along the cube's row axis.
        column_name: name of the second index level.
        columns: labels along the cube's column axis.
        fields: names for the field-value columns.

    Returns:
        DataFrame with a (row_name, column_name) MultiIndex and one column per
        field.
    """
    records = [
        [row_label, col_label] + pf[r]["columns"][c]["fields"]
        for r, row_label in enumerate(rows)
        for c, col_label in enumerate(columns)
    ]
    frame = pd.DataFrame(records, columns=[row_name, column_name] + fields)
    return frame.set_index([row_name, column_name])
229
230
def getMaxClientVersion() -> Dict[str, Any]:
    """Find the newest python client wheel published to the downloads bucket.

    Lists the S3 bucket, extracts every
    ``boosted_insights_python_client-<version>-py3-*`` key, and picks the
    highest version.

    Returns:
        Dict with "version" (dotted version string) and "weblink" (download
        URL) for the highest-versioned wheel.

    Raises:
        ValueError: the bucket listing request did not return HTTP 200.
    """
    import re

    import requests

    base_url = "https://boosted-downloads.s3.amazonaws.com/"
    url = f"{base_url}?list-type=2"
    OK = 200
    resp = requests.get(url)

    if resp.status_code != OK:
        raise ValueError(f"Bad Response Code: {resp.status_code}")

    key_pattern = r"<Key>(boosted_insights_python_client-([0-9\.]+)-py3-[^<]+)"
    candidates = []
    for match in re.finditer(key_pattern, resp.text):
        candidates.append(
            {
                # Parse "1.2.3" into (1, 2, 3) so versions compare numerically,
                # not lexicographically.
                "version": tuple(int(part) for part in match.group(2).split(".")),
                "weblink": f"{base_url}{match.group(1)}",
            }
        )

    newest = max(candidates, key=lambda info: info["version"])
    # Convert the version tuple back into a dotted string for callers.
    newest["version"] = ".".join(str(part) for part in newest["version"])
    return newest
logger = <Logger boosted.api.api_util (WARNING)>
def to_camel_case(key: Union[str, NoneType]) -> Union[str, NoneType]:
36def to_camel_case(key: Optional[str]) -> Optional[str]:
37    if not key:
38        return key
39    return f"{key[0].upper()}{key[1:].lower()}"
def infer_dataset_schema( name: str, df: pandas.core.frame.DataFrame, dataset_type: boosted.api.api_type.DataSetType, dataset_subtype: boosted.api.api_type.DataSetSubType = <DataSetSubType.DENSE: 1>, dataset_frequency: boosted.api.api_type.DataSetFrequency = <DataSetFrequency.DAILY: 1>, infer_dataset_report_period: bool = False, infer_from_column_names: bool = False) -> boosted.api.api_type.DataSetConfig:
 42def infer_dataset_schema(
 43    name: str,
 44    df: pd.DataFrame,
 45    dataset_type: DataSetType,
 46    dataset_subtype: DataSetSubType = DataSetSubType.DENSE,
 47    dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY,
 48    infer_dataset_report_period: bool = False,
 49    infer_from_column_names: bool = False,
 50) -> DataSetConfig:
 51    # Sanity checks:
 52    # Time index
 53    if type(df.index) != pd.core.indexes.datetimes.DatetimeIndex:
 54        raise BoostedDataSetSchemaException("Index must be DatetimeIndex.")
 55    if len(df.columns) == 0:
 56        raise BoostedDataSetSchemaException("No feature columns exist.")
 57
 58    datasetConfig = DataSetConfig(name, dataset_type, dataset_subtype, dataset_frequency)
 59    # More than two columns, one goal, one feature
 60    non_stock_identifier_seen = False
 61
 62    def variable(name: str) -> ColumnConfig:
 63        return ColumnConfig(name=name, role=ColumnRole.VARIABLE, value_type=ColumnValueType.NUMBER)
 64
 65    for i, c in enumerate(zip(df.columns.values, df.dtypes.values)):
 66        # process stock identifiers first, ensuring that they all lie grouped
 67        # at the front of the column list
 68        if (
 69            infer_from_column_names
 70            and dataset_type in [DataSetType.STOCK, DataSetType.SECURITIES_DAILY]
 71            and not non_stock_identifier_seen
 72        ):
 73            # make a good effort to match column names with identifiers by stripping out
 74            # punctuation and lowercasing.
 75            sub_role_match = ColumnSubRole.get_match(c[0])
 76            if sub_role_match:
 77                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=sub_role_match)
 78                datasetConfig.addColumn(f)
 79                continue
 80
 81            custon_namespace_variable_role_match = CustomNamespaceVariableRole.get_match(c[0])
 82            if custon_namespace_variable_role_match:
 83                f = ColumnConfig(
 84                    name=c[0],
 85                    role=ColumnRole.VARIABLE,
 86                    custom_namespace_variable_role=custon_namespace_variable_role_match,
 87                )
 88                datasetConfig.addColumn(f)
 89                continue
 90
 91            # end stock identifiers processing sequence as soon as we see something that's not
 92            # an identifier
 93            non_stock_identifier_seen = True
 94
 95        if dataset_type == DataSetType.STRATEGY and i < 1:
 96            # Don't need to add the first column to the schema.
 97            # It is assumed to be the security identifier.
 98            strategy_column_name = c[0]
 99        elif (
100            not infer_from_column_names
101            and dataset_type == DataSetType.STOCK
102            and (
103                (dataset_subtype == DataSetSubType.DENSE and i < 3)
104                or (
105                    dataset_subtype in [DataSetSubType.SPARSE_HIST, DataSetSubType.SPARSE_FWD]
106                    and i < 5
107                )
108            )
109        ):
110            if i == 0:
111                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.ISIN)
112            elif i == 1:
113                f = ColumnConfig(
114                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.COUNTRY
115                )
116            elif i == 2:
117                f = ColumnConfig(
118                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.CURRENCY
119                )
120            elif i == 3:
121                f = ColumnConfig(
122                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_DATE
123                )
124            elif i == 4:
125                if infer_dataset_report_period:
126                    f = variable(c[0])
127                else:
128                    f = ColumnConfig(
129                        name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_PERIOD
130                    )
131            datasetConfig.addColumn(f)
132        elif dataset_type == DataSetType.SECURITIES_DAILY:
133            raise NotImplementedError(
134                f"Can not infer order for custom security daily dataset "
135                + "without exact column name match as hints! Please see DataSetConfig to create "
136                + f"your own dataset schema. (column = {c[0]})"
137            )
138        elif pd.api.types.is_numeric_dtype(c[1]):
139            # Is numeric
140            f = variable(c[0])
141            datasetConfig.addColumn(f)
142        else:
143            logger.info(
144                "Only numeric types are supported now."
145                '  IGNORING THIS COLUMN. Name = "{0}", type = {1}.'.format(c[0], c[1])
146            )
147    if dataset_type == DataSetType.STRATEGY:
148        strategies = list(OrderedDict.fromkeys(df[strategy_column_name].tolist()))
149        for strategy in strategies:
150            datasetConfig.addStrategy(StrategyConfig(name=strategy, source_name=strategy))
151    return datasetConfig
def validate_start_and_end_dates( start_date: Union[datetime.date, str, NoneType], end_date: Union[datetime.date, str, NoneType]) -> None:
154def validate_start_and_end_dates(
155    start_date: Optional[BoostedDate], end_date: Optional[BoostedDate]
156) -> None:
157    """Validate if start_date is earlier than end_date
158
159    Args:
160        start_date (BoostedDate): a date object or a valid ISO date string
161        end_date (BoostedDate): a date object or a valid ISO date string
162
163    Raises:
164        ValueError: the arguments are not date object or a valid ISO date string
165    """
166    if isinstance(start_date, str):
167        try:
168            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
169        except ValueError:
170            raise ValueError("Start date must be a valid YYYY-MM-DD string.")
171    if isinstance(end_date, str):
172        try:
173            end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
174        except ValueError:
175            raise ValueError("End date must be a valid YYYY-MM-DD string.")
176    if start_date and end_date and start_date > end_date:
177        raise ValueError("Start date cannot be after end date!")

Validate if start_date is earlier than end_date

Args: start_date (BoostedDate): a date object or a valid ISO date string end_date (BoostedDate): a date object or a valid ISO date string

Raises: ValueError: the arguments are not date object or a valid ISO date string

def get_valid_iso_dates( start_date: Union[datetime.date, str, NoneType] = None, end_date: Union[datetime.date, str, NoneType] = None) -> Tuple[str, str]:
180def get_valid_iso_dates(
181    start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None
182) -> Tuple[str, str]:
183    if (start_date and not end_date) or (end_date and not start_date):
184        raise BoostedAPIException("Must provide both start and end dates or neither")
185    elif not end_date and not start_date:
186        end_date = datetime.date.today()
187        start_date = end_date - datetime.timedelta(days=30)  # default to 30 days
188        end_date_iso = end_date.isoformat()
189        start_date_iso = start_date.isoformat()
190    else:
191        if isinstance(start_date, datetime.date):
192            start_date_iso = start_date.isoformat()
193        else:
194            start_date_iso = start_date  # type: ignore
195        if isinstance(end_date, datetime.date):
196            end_date_iso = end_date.isoformat()
197        else:
198            end_date_iso = end_date  # type: ignore
199
200    return start_date_iso, end_date_iso
def estimateUploadBatchPeriod(df: pandas.core.frame.DataFrame) -> Union[str, NoneType]:
204def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]:
205    maxrows = 30000
206    periods = ["Y", "M"]
207    for p in periods:
208        sizes = []
209        # mypy can't tell df.index is DatetimeIndex
210        for t in df.index.to_period(p).unique():  # type: ignore[attr-defined]
211            print(t)
212            sizes.append(len(df.loc[str(t)]))
213        if max(sizes) < maxrows:
214            return p
215    # If we got here, no period worked.
216    return None
def protoCubeJsonDataToDataFrame( pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list) -> pandas.core.frame.DataFrame:
219def protoCubeJsonDataToDataFrame(
220    pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list
221) -> pd.DataFrame:
222    pc_list = []
223    for row_idx, row in enumerate(rows):
224        for col_idx, col in enumerate(columns):
225            pc_list.append([row, col] + pf[row_idx]["columns"][col_idx]["fields"])
226    df = pd.DataFrame(pc_list)
227    df = df.set_axis([row_name, column_name] + fields, axis="columns")
228    df.set_index([row_name, column_name], inplace=True)
229    return df
def getMaxClientVersion() -> Dict[str, Any]:
232def getMaxClientVersion() -> Dict[str, Any]:
233    import re
234
235    import requests
236
237    base_url = "https://boosted-downloads.s3.amazonaws.com/"
238    url = f"{base_url}?list-type=2"
239    OK = 200
240    resp = requests.get(url)
241
242    if resp.status_code == OK:
243        groups = [
244            x
245            for x in re.finditer(
246                r"<Key>(boosted_insights_python_client-([0-9\.]+)-py3-[^<]+)", resp.text
247            )
248        ]
249
250        def getVersion(x: Dict[str, Any]) -> Tuple:
251            return x["version"]
252
253        max_info = max(
254            [
255                {
256                    "version": tuple(map(int, group.group(2).split("."))),
257                    "weblink": f"{base_url}{group.group(1)}",
258                }
259                for group in groups
260            ],
261            key=getVersion,
262        )
263        max_info["version"] = str(".".join(map(str, max_info["version"])))
264        return max_info
265    else:
266        raise ValueError(f"Bad Response Code: {resp.status_code}")