Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preprocessing data is missing, raising a KeyError: AutoML itself created data (features) during training, and that behaviour is not repeated at prediction time. #1278

Open
731315163 opened this issue Feb 13, 2024 · 1 comment

Comments

@731315163
Copy link

731315163 commented Feb 13, 2024

= X[self.regressors]
File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/frame.py", line 3899, in getitem
indexer = self.columns._get_indexer_strict(key, "columns")[1]
File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 6115, in _get_indexer_strict
self._raise_if_missing(keyarr, indexer, axis_name)
File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 6179, in _raise_if_missing
raise KeyError(f"{not_found} not in index")
KeyError: "['index_sin1', 'index_sin6', 'index_cos5', 'index_second_cos', 'index_sin2', 'index_second_sin', 'index_cos4', 'index_minute_cos', 'index_dayofweek_sin', 'index_cos1', 'index_month_cos', 'index_dayofyear_sin', 'index_sin3', 'index_sin5', 'index_cos3', 'index_hour_sin', 'index_cos6', 'index_hour_cos', 'index_month_sin', 'index_minute_sin', 'index_sin4', 'index_cos2', 'index_quarter_sin', 'index_quarter_cos', 'index_dayofyear_cos', 'index_dayofweek_cos'] not in index"

import pandas as pd
from flaml.automl import AutoML, logger_formatter
from flaml.tune.searcher import CFO, BlendSearch, FLOW2, BlendSearchTuner
import numpy as np
from libX import PreprocessingData as data
import pickle
import os.path as path

# File used to persist the fitted AutoML object (written by train(), read by findmodel()).
savepath = data.JoinCurDir("automl.pkl")
# Name of the date column; train() parses it and makes it the frame's index.
datename = "DATE"
# Name of the column passed to AutoML.fit as the forecasting label.
openen = "Open"


def train():
    """Fit a FLAML time-series forecaster on the raw data and persist it.

    Loads the ``DATE``, ``WM2NS``, ``UNRATE`` and ``Open`` columns through
    ``data.get_rawdata_df``, parses the date and label columns, indexes the
    frame by date and runs ``AutoML.fit`` with the ``ts_forecast`` task.

    The best configuration and the pickled ``AutoML`` object are written to
    two different files. Writing both to ``savepath`` would let the pickle
    overwrite the configuration that was saved just before it.

    Returns:
        The fitted ``AutoML`` instance.
    """
    traindata = data.get_rawdata_df(["DATE", "WM2NS", "UNRATE", "Open"])
    # NOTE(review): reset_index() without drop=True turns the previous index
    # into a column (named "index" when the index is unnamed). The KeyError in
    # the report lists features prefixed with "index_", so such a column may
    # be what FLAML used as the time column — confirm against the raw frame.
    traindata.reset_index(inplace=True)
    traindata[datename] = pd.to_datetime(traindata[datename], format="%Y-%m-%d")
    traindata[openen] = pd.to_numeric(traindata[openen])
    # NOTE(review): asfreq() is applied to a column whose index is not the
    # date at this point — verify that this line does what is intended.
    traindata[datename] = traindata[datename].asfreq("D")
    traindata.set_index(keys=datename, inplace=True)

    print(traindata.head(3))

    automl = AutoML()
    automl_settings = {
        "task": "ts_forecast",
        "time_budget": 60 * 10,  # seconds
        "log_file_name": "ts_forecast.log",
        "period": 14,  # forecast horizon
    }

    automl.fit(
        dataframe=traindata,
        label=openen,
        ensemble=True,
        early_stop=True,
        **automl_settings
    )

    # Keep the best config next to, not inside, the pickle file: e.g.
    # ".../automl.pkl" -> ".../automl_best_config.json".
    config_path = path.splitext(savepath)[0] + "_best_config.json"
    automl.save_best_config(config_path)
    with open(savepath, "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    return automl


def findmodel():
    """Return the persisted AutoML object, training a new one if none exists.

    When ``savepath`` is missing, ``train()`` is called and its result is
    returned; otherwise the object pickled at ``savepath`` is loaded.
    """
    if not path.exists(savepath):
        return train()

    # A previous run already stored a model: reuse it instead of refitting.
    with open(savepath, "rb") as model_file:
        return pickle.load(model_file)


# Three hand-written rows used to try out prediction with the stored model.
# NOTE(review): train() parses the DATE column with pd.to_datetime, while the
# DATE values here are plain integers — confirm whether real dates are needed.
datetimetest = pd.DataFrame(
    {
        datename: [1, 2, 3],
        "WM2NS": [20966.057142857142, 20963.77142857143, 20961.485714285714],
        "UNRATE": [3.7, 3.7, 3.7],
    }
)
automl = findmodel()
# set_index() returns a new frame unless inplace=True, so the result must be
# kept; train() likewise indexes its frame by the date column.
datetimetest = datetimetest.set_index(datename)
pred = automl.predict(datetimetest)
print(pred)
@thinkall
Copy link
Collaborator

Hi @731315163, the error message is related to pandas and the data itself. Can you check that the index handling works without involving flaml?
Besides, you're using the same savepath for both best config and the automl instance.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants