boosted.api.api_util

# Copyright (C) 2020 Gradient Boosted Investments, Inc. - All Rights Reserved

import datetime
import logging
from collections import OrderedDict
from typing import Any, Dict, Optional, Tuple

import pandas as pd
from boosted.api.api_type import (
    BoostedAPIException,
    BoostedDataSetSchemaException,
    BoostedDate,
    ColumnConfig,
    ColumnRole,
    ColumnSubRole,
    ColumnValueType,
    DataSetConfig,
    DataSetFrequency,
    DataSetSubType,
    DataSetType,
    StrategyConfig,
)

# import numpy as np


logger = logging.getLogger("boosted.api.api_util")


def infer_dataset_schema(
    name: str,
    df: pd.DataFrame,
    dataset_type: DataSetType,
    dataset_subtype: DataSetSubType = DataSetSubType.DENSE,
    dataset_frequency: DataSetFrequency = DataSetFrequency.DAILY,
    infer_dataset_report_period: bool = False,
    infer_from_column_names: bool = False,
) -> DataSetConfig:
    # Sanity checks:
    # Time index
    if not isinstance(df.index, pd.DatetimeIndex):
        raise BoostedDataSetSchemaException("Index must be DatetimeIndex.")
    if len(df.columns) == 0:
        raise BoostedDataSetSchemaException("No feature columns exist.")

    datasetConfig = DataSetConfig(name, dataset_type, dataset_subtype, dataset_frequency)
    # More than two columns, one goal, one feature
    non_stock_identifier_seen = False

    def variable(name: str) -> ColumnConfig:
        return ColumnConfig(name=name, role=ColumnRole.VARIABLE, value_type=ColumnValueType.NUMBER)

    for i, c in enumerate(zip(df.columns.values, df.dtypes.values)):
        # process stock identifiers first, ensuring that they all lie grouped
        # at the front of the column list
        if (
            infer_from_column_names
            and dataset_type == DataSetType.STOCK
            and not non_stock_identifier_seen
        ):
            # make a good effort to match column names with identifiers by stripping out
            # punctuation and lowercasing.
            sub_role_match = ColumnSubRole.get_match(c[0])
            if sub_role_match:
                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=sub_role_match)
                datasetConfig.addColumn(f)
                continue
            # end stock identifiers processing sequence as soon as we see something that's not
            # an identifier
            non_stock_identifier_seen = True

        if dataset_type == DataSetType.STRATEGY and i < 1:
            # Don't need to add the first column to the schema.
            # It is assumed to be the security identifier.
            strategy_column_name = c[0]
        elif (
            not infer_from_column_names
            and dataset_type == DataSetType.STOCK
            and (
                (dataset_subtype == DataSetSubType.DENSE and i < 3)
                or (
                    dataset_subtype in [DataSetSubType.SPARSE_HIST, DataSetSubType.SPARSE_FWD]
                    and i < 5
                )
            )
        ):
            if i == 0:
                f = ColumnConfig(name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.ISIN)
            elif i == 1:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.COUNTRY
                )
            elif i == 2:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.CURRENCY
                )
            elif i == 3:
                f = ColumnConfig(
                    name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_DATE
                )
            elif i == 4:
                if infer_dataset_report_period:
                    f = variable(c[0])
                else:
                    f = ColumnConfig(
                        name=c[0], role=ColumnRole.IDENTIFIER, sub_role=ColumnSubRole.REPORT_PERIOD
                    )
            datasetConfig.addColumn(f)
        elif pd.api.types.is_numeric_dtype(c[1]):
            # Is numeric
            f = variable(c[0])
            datasetConfig.addColumn(f)
        else:
            logger.info(
                "Only numeric types are supported now."
                '  IGNORING THIS COLUMN. Name = "{0}", type = {1}.'.format(c[0], c[1])
            )
    if dataset_type == DataSetType.STRATEGY:
        strategies = list(OrderedDict.fromkeys(df[strategy_column_name].tolist()))
        for strategy in strategies:
            datasetConfig.addStrategy(StrategyConfig(name=strategy, source_name=strategy))
    return datasetConfig


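# Usage sketch (hypothetical example, not part of the module): for a DENSE
# stock dataset with infer_from_column_names=False, the first three columns
# are taken positionally as ISIN / country / currency identifiers, and any
# remaining numeric columns become variables.
#
# >>> import pandas as pd
# >>> df = pd.DataFrame(
# ...     {
# ...         "isin": ["US0378331005"],
# ...         "country": ["US"],
# ...         "currency": ["USD"],
# ...         "momentum": [0.42],
# ...     },
# ...     index=pd.DatetimeIndex(["2020-01-02"]),
# ... )
# >>> config = infer_dataset_schema("my_dataset", df, DataSetType.STOCK)

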
def validate_start_and_end_dates(
    start_date: Optional[BoostedDate], end_date: Optional[BoostedDate]
) -> None:
    """Validate that start_date is not later than end_date.

    Args:
        start_date (BoostedDate): a date object or a valid YYYY-MM-DD date string
        end_date (BoostedDate): a date object or a valid YYYY-MM-DD date string

    Raises:
        ValueError: an argument is neither a date object nor a valid ISO date
            string, or start_date is after end_date
    """
    if isinstance(start_date, str):
        try:
            # Normalize to a date so that str and date arguments compare cleanly.
            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError("Start date must be a valid YYYY-MM-DD string.")
    if isinstance(end_date, str):
        try:
            end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError("End date must be a valid YYYY-MM-DD string.")
    if start_date and end_date and start_date > end_date:
        raise ValueError("Start date cannot be after end date!")


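# Usage sketch (hypothetical): date objects and YYYY-MM-DD strings may be
# mixed freely, and an inverted range raises ValueError.
#
# >>> validate_start_and_end_dates("2020-01-01", datetime.date(2020, 2, 1))
# >>> validate_start_and_end_dates("2020-02-01", "2020-01-01")
# Traceback (most recent call last):
#     ...
# ValueError: Start date cannot be after end date!

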
def get_valid_iso_dates(
    start_date: Optional[BoostedDate] = None, end_date: Optional[BoostedDate] = None
) -> Tuple[str, str]:
    if (start_date and not end_date) or (end_date and not start_date):
        raise BoostedAPIException("Must provide both start and end dates or neither")
    elif not end_date and not start_date:
        end_date = datetime.date.today()
        start_date = end_date - datetime.timedelta(days=30)  # default to 30 days
        end_date_iso = end_date.isoformat()
        start_date_iso = start_date.isoformat()
    else:
        if isinstance(start_date, datetime.date):
            start_date_iso = start_date.isoformat()
        else:
            start_date_iso = start_date  # type: ignore
        if isinstance(end_date, datetime.date):
            end_date_iso = end_date.isoformat()
        else:
            end_date_iso = end_date  # type: ignore

    return start_date_iso, end_date_iso


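# Usage sketch (hypothetical): explicit arguments are passed through as ISO
# strings; calling with no arguments yields the 30-day window ending today.
#
# >>> get_valid_iso_dates(datetime.date(2020, 1, 1), "2020-02-01")
# ('2020-01-01', '2020-02-01')

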
# TODO: this function is not called by anyone. Consider removing it.
def estimateUploadBatchPeriod(df: pd.DataFrame) -> Optional[str]:
    maxrows = 30000
    periods = ["Y", "M"]
    for p in periods:
        sizes = []
        # mypy can't tell df.index is a DatetimeIndex
        for t in df.index.to_period(p).unique():  # type: ignore[attr-defined]
            sizes.append(len(df.loc[str(t)]))
        if max(sizes) < maxrows:
            return p
    # If we got here, no period worked.
    return None


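# Usage sketch (hypothetical): 100 daily rows fit well under the 30,000-row
# cap for a single yearly batch, so "Y" is chosen.
#
# >>> df = pd.DataFrame(
# ...     {"x": range(100)},
# ...     index=pd.date_range("2020-01-01", periods=100, freq="D"),
# ... )
# >>> estimateUploadBatchPeriod(df)
# 'Y'

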
def protoCubeJsonDataToDataFrame(
    pf: list, row_name: str, rows: list, column_name: str, columns: list, fields: list
) -> pd.DataFrame:
    # Flatten the proto cube into one record per (row, column) pair, carrying
    # that cell's field values, then index the frame by (row_name, column_name).
    pc_list = []
    for row_idx, row in enumerate(rows):
        for col_idx, col in enumerate(columns):
            pc_list.append([row, col] + pf[row_idx]["columns"][col_idx]["fields"])
    df = pd.DataFrame(pc_list)
    df = df.set_axis([row_name, column_name] + fields, axis="columns")
    df.set_index([row_name, column_name], inplace=True)
    return df


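# Usage sketch (hypothetical input): pf mirrors the proto-cube JSON layout,
# with pf[row_idx]["columns"][col_idx]["fields"] holding one value per name
# in `fields`.
#
# >>> pf = [
# ...     {"columns": [{"fields": [1.0, 2.0]}, {"fields": [3.0, 4.0]}]},
# ...     {"columns": [{"fields": [5.0, 6.0]}, {"fields": [7.0, 8.0]}]},
# ... ]
# >>> df = protoCubeJsonDataToDataFrame(
# ...     pf, "model", ["m1", "m2"], "horizon", ["1M", "3M"], ["mean", "std"]
# ... )
# >>> df.loc[("m2", "1M"), "mean"]
# 5.0

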
def getMaxClientVersion() -> Dict[str, Any]:
    import re

    import requests

    base_url = "https://boosted-downloads.s3.amazonaws.com/"
    url = f"{base_url}?list-type=2"
    OK = 200
    resp = requests.get(url)

    if resp.status_code == OK:
        # Pull every python-client wheel key out of the bucket listing, along
        # with its dotted version string.
        groups = [
            x
            for x in re.finditer(
                r"<Key>(boosted_insights_python_client-([0-9\.]+)-py3-[^<]+)", resp.text
            )
        ]

        def getVersion(x: Dict[str, Any]) -> Tuple:
            return x["version"]

        # Compare versions as integer tuples so that e.g. 1.10 sorts above 1.9.
        max_info = max(
            [
                {
                    "version": tuple(map(int, group.group(2).split("."))),
                    "weblink": f"{base_url}{group.group(1)}",
                }
                for group in groups
            ],
            key=getVersion,
        )
        max_info["version"] = str(".".join(map(str, max_info["version"])))
        return max_info
    else:
        raise ValueError(f"Bad Response Code: {resp.status_code}")