
Commit

modular unit tests
serengil committed Dec 25, 2023
1 parent c9d1d1e commit 33c0b2c
Showing 25 changed files with 549 additions and 426 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -40,7 +40,7 @@ jobs:
- name: Test with pytest
run: |
cd tests
python global-unit-test.py
python -m pytest . -s --disable-warnings
linting:
needs: unit-tests

2 changes: 1 addition & 1 deletion Makefile
@@ -1,5 +1,5 @@
test:
cd tests && python global-unit-test.py
cd tests && python -m pytest . -s --disable-warnings

lint:
python -m pylint chefboost/ --fail-under=10
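
Both the CI workflow and the Makefile now invoke pytest directly instead of the deleted `global-unit-test.py` script. As an illustrative sketch (not part of the commit), the same suite can be launched programmatically from the `tests/` directory:

```python
# Illustrative sketch only: runs the suite the same way as
# `cd tests && python -m pytest . -s --disable-warnings`.
import sys

import pytest

if __name__ == "__main__":
    # "-s" keeps test output visible, "--disable-warnings" mirrors the Makefile target
    sys.exit(pytest.main([".", "-s", "--disable-warnings"]))
```
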
81 changes: 58 additions & 23 deletions chefboost/Chefboost.py
@@ -24,6 +24,7 @@ def fit(
config: Optional[dict] = None,
target_label: str = "Decision",
validation_df: Optional[pd.DataFrame] = None,
silent: bool = False,
) -> Dict[str, Any]:
"""
Build (a) decision tree model(s)
@@ -55,6 +56,9 @@ def fit(
if nothing is passed to validation data frame, then the function validates
built trees for training data frame
silent (bool): set this to True if you do not want to see
any informative logs
Returns:
chefboost model
"""
@@ -139,7 +143,8 @@ def fit(

if enableParallelism == True:
num_cores = config["num_cores"]
logger.info(f"[INFO]: {num_cores} CPU cores will be allocated in parallel running")
if silent is False:
logger.info(f"[INFO]: {num_cores} CPU cores will be allocated in parallel running")

from multiprocessing import set_start_method, freeze_support

@@ -169,7 +174,8 @@ def fit(
config["algorithm"] = "Regression"

if enableGBM == True:
logger.info("Gradient Boosting Machines...")
if silent is False:
logger.info("Gradient Boosting Machines...")
algorithm = "Regression"
config["algorithm"] = "Regression"

@@ -184,7 +190,8 @@ def fit(

# -------------------------

logger.info(f"{algorithm} tree is going to be built...")
if silent is False:
logger.info(f"{algorithm} tree is going to be built...")

# initialize a dictionary. this is going to be used to check features numeric or nominal.
# numeric features should be transformed to nominal values based on scales.
@@ -212,7 +219,13 @@ def fit(

if enableAdaboost == True:
trees, alphas = adaboost_clf.apply(
df, config, header, dataset_features, validation_df=validation_df, process_id=process_id
df,
config,
header,
dataset_features,
validation_df=validation_df,
process_id=process_id,
silent=silent,
)

elif enableGBM == True:
@@ -224,6 +237,7 @@ def fit(
dataset_features,
validation_df=validation_df,
process_id=process_id,
silent=silent,
)
# classification = True

@@ -235,12 +249,19 @@ def fit(
dataset_features,
validation_df=validation_df,
process_id=process_id,
silent=silent,
)
# classification = False

elif enableRandomForest == True:
trees = randomforest.apply(
df, config, header, dataset_features, validation_df=validation_df, process_id=process_id
df,
config,
header,
dataset_features,
validation_df=validation_df,
process_id=process_id,
silent=silent,
)
else: # regular decision tree building
root = 1
@@ -264,22 +285,23 @@ def fit(
main_process_id=process_id,
)

logger.info("-------------------------")
logger.info(f"finished in {time.time() - begin} seconds")
if silent is False:
logger.info("-------------------------")
logger.info(f"finished in {time.time() - begin} seconds")

obj = {"trees": trees, "alphas": alphas, "config": config, "nan_values": nan_values}

# -----------------------------------------

# train set accuracy
df = base_df.copy()
evaluate(obj, df, task="train")
trainset_evaluation = evaluate(obj, df, task="train", silent=silent)
obj["evaluation"] = {"train": trainset_evaluation}

# validation set accuracy
if isinstance(validation_df, pd.DataFrame):
evaluate(obj, validation_df, task="validation")

# -----------------------------------------
validationset_evaluation = evaluate(obj, validation_df, task="validation", silent=silent)
obj["evaluation"]["validation"] = validationset_evaluation

return obj

@@ -455,31 +477,38 @@ def restoreTree(module_name) -> Any:
return functions.restoreTree(module_name)


def feature_importance(rules: Union[str, list]) -> pd.DataFrame:
def feature_importance(rules: Union[str, list], silent: bool = False) -> pd.DataFrame:
"""
Show the feature importance values of a built model
Args:
rules (str or list): e.g. decision_rules = "outputs/rules/rules.py"
rules (str or list): e.g. decision_rules = "outputs/rules/rules.py"
or this could be retrieved from built model as shown below.
decision_rules = []
for tree in model["trees"]:
rule = .__dict__["__spec__"].origin
decision_rules.append(rule)
```python
decision_rules = []
for tree in model["trees"]:
rule = tree.__dict__["__spec__"].origin
decision_rules.append(rule)
```
silent (bool): set this to True if you do not want to see
any informative logs.
Returns:
feature importance (pd.DataFrame)
"""

if not isinstance(rules, list):
rules = [rules]
logger.info(f"rules: {rules}")

if silent is False:
logger.info(f"rules: {rules}")

# -----------------------------

dfs = []

for rule in rules:
logger.info("Decision rule: {rule}")
if silent is False:
logger.info(f"Decision rule: {rule}")

with open(rule, "r", encoding="UTF-8") as file:
lines = file.readlines()
@@ -564,17 +593,23 @@ def feature_importance(rules: Union[str, list]) -> pd.DataFrame:


def evaluate(
model: dict, df: pd.DataFrame, target_label: str = "Decision", task: str = "test"
) -> None:
model: dict,
df: pd.DataFrame,
target_label: str = "Decision",
task: str = "test",
silent: bool = False,
) -> dict:
"""
Evaluate the performance of a built model on a data set
Args:
model (dict): built model which is the output of fit function
df (pandas data frame): data frame you would like to evaluate
target_label (str): target label
task (string): set this to train, validation or test
silent (bool): set this to True if you do not want to see
any informative logs
Returns:
None
evaluation results (dict)
"""

# --------------------------
@@ -598,4 +633,4 @@ def evaluate(
df["Decision"] = df["Decision"].astype(str)
df["Prediction"] = df["Prediction"].astype(str)

cb_eval.evaluate(df, task=task)
return cb_eval.evaluate(df, task=task, silent=silent)
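
Taken together, `fit`, `evaluate` and `feature_importance` now accept a `silent` flag, and evaluation metrics are returned instead of only being logged. A minimal usage sketch, assuming `dataset/golf.txt` is available as in the repository's tests:

```python
# Hedged usage sketch of the new silent flag and returned evaluation metrics.
# Assumes dataset/golf.txt exists, as in the repository's test suite.
import pandas as pd
from chefboost import Chefboost as cb

df = pd.read_csv("dataset/golf.txt")

# silent=True suppresses the informative logs and progress bars
model = cb.fit(df, config={"algorithm": "ID3"}, silent=True)

# training metrics are now attached to the returned model
print(model["evaluation"]["train"]["Accuracy"])

# evaluate() now returns the metrics dictionary instead of None
results = cb.evaluate(model, df.copy(), task="test", silent=True)
print(results["Accuracy"], results["Instances"])

# feature_importance() also honours silent; rule files come from the built trees
rules = [tree.__dict__["__spec__"].origin for tree in model["trees"]]
fi_df = cb.feature_importance(rules, silent=True)
print(fi_df)
```
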
90 changes: 67 additions & 23 deletions chefboost/commons/evaluate.py
@@ -1,30 +1,44 @@
import math
import pandas as pd
from chefboost.commons.logger import Logger

# pylint: disable=broad-except

logger = Logger(module="chefboost/commons/evaluate.py")


def evaluate(df, task="train"):
def evaluate(df: pd.DataFrame, task: str = "train", silent: bool = False) -> dict:
"""
Evaluate results
Args:
df (pd.DataFrame): data frame
task (str): train, test
silent (bool): set this to True if you do not want to
see any informative logs
Returns:
evaluation results (dict)
"""
if df["Decision"].dtypes == "object":
problem_type = "classification"
else:
problem_type = "regression"

# -------------------------------------

evaluation_results = {}
instances = df.shape[0]

logger.info("-------------------------")
logger.info(f"Evaluate {task} set")
logger.info("-------------------------")
if silent is False:
logger.info("-------------------------")
logger.info(f"Evaluate {task} set")
logger.info("-------------------------")

if problem_type == "classification":
idx = df[df["Prediction"] == df["Decision"]].index
accuracy = 100 * len(idx) / df.shape[0]
logger.info(f"Accuracy: {accuracy}% on {instances} instances")
if silent is False:
logger.info(f"Accuracy: {accuracy}% on {instances} instances")

evaluation_results["Accuracy"] = accuracy
evaluation_results["Instances"] = instances
# -----------------------------

predictions = df.Prediction.values
@@ -48,8 +62,12 @@ def evaluate(df, task="train"):
confusion_row.append(item)
confusion_matrix.append(confusion_row)

logger.info(f"Labels: {labels}")
logger.info(f"Confusion matrix: {confusion_matrix}")
if silent is False:
logger.info(f"Labels: {labels}")
logger.info(f"Confusion matrix: {confusion_matrix}")

evaluation_results["Labels"] = labels
evaluation_results["Confusion matrix"] = confusion_matrix

# -----------------------------
# precision and recall
@@ -79,11 +97,19 @@ def evaluate(df, task="train"):
accuracy = round(100 * (tp + tn) / (tp + tn + fp + fn + epsilon), 4)

if len(labels) >= 3:
logger.info(f"Decision {decision_class}")
logger.info(f"Accuray: {accuracy}")
if silent is False:
logger.info(f"Decision {decision_class}")
logger.info(f"Accuracy: {accuracy}")

evaluation_results[f"Decision {decision_class}'s Accuracy"] = accuracy

logger.info(f"Precision: {precision}%, Recall: {recall}%, F1: {f1_score}%")
logger.debug(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
if silent is False:
logger.info(f"Precision: {precision}%, Recall: {recall}%, F1: {f1_score}%")
logger.debug(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

evaluation_results["Precision"] = precision
evaluation_results["Recall"] = recall
evaluation_results["F1"] = f1_score

if len(labels) < 3:
break
@@ -99,13 +125,17 @@ def evaluate(df, task="train"):

if instances > 0:
mae = df["Absolute_Error"].sum() / instances
logger.info(f"MAE: {mae}")

mse = df["Absolute_Error_Squared"].sum() / instances
logger.info(f"MSE: {mse}")

rmse = math.sqrt(mse)
logger.info(f"RMSE: {rmse}")

evaluation_results["MAE"] = mae
evaluation_results["MSE"] = mse
evaluation_results["RMSE"] = rmse

if silent is False:
logger.info(f"MAE: {mae}")
logger.info(f"MSE: {mse}")
logger.info(f"RMSE: {rmse}")

rae = 0
rrse = 0
@@ -122,12 +152,26 @@ def evaluate(df, task="train"):
except Exception as err:
logger.error(str(err))

logger.info(f"RAE: {rae}")
logger.info(f"RRSE {rrse}")
if silent is False:
logger.info(f"RAE: {rae}")
logger.info(f"RRSE {rrse}")

evaluation_results["RAE"] = rae
evaluation_results["RRSE"] = rrse

mean = df["Decision"].mean()
logger.info(f"Mean: {mean}")

if silent is False:
logger.info(f"Mean: {mean}")

evaluation_results["Mean"] = mean

if mean > 0:
logger.info(f"MAE / Mean: {100 * mae / mean}%")
logger.info(f"RMSE / Mean: {100 * rmse / mean}%")
if silent is False:
logger.info(f"MAE / Mean: {100 * mae / mean}%")
logger.info(f"RMSE / Mean: {100 * rmse / mean}%")

evaluation_results["MAE / Mean"] = 100 * mae / mean
evaluation_results["RMSE / Mean"] = 100 * rmse / mean

return evaluation_results
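
Every metric that used to be only logged is now collected into the `evaluation_results` dictionary that the function returns. A short sketch of the keys a caller can expect; classification and regression runs populate different subsets:

```python
# Sketch of the metric keys exposed by the returned evaluation dictionary.
# Classification runs populate accuracy / confusion-matrix / P-R-F1 entries,
# regression runs populate the error metrics; per-class accuracies are added
# as "Decision <label>'s Accuracy" when there are three or more labels.
def summarize(results: dict) -> None:
    classification_keys = [
        "Accuracy", "Instances", "Labels", "Confusion matrix",
        "Precision", "Recall", "F1",
    ]
    regression_keys = [
        "MAE", "MSE", "RMSE", "RAE", "RRSE",
        "Mean", "MAE / Mean", "RMSE / Mean",
    ]
    for key in classification_keys + regression_keys:
        if key in results:
            print(f"{key}: {results[key]}")
```
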
18 changes: 6 additions & 12 deletions chefboost/training/Training.py
@@ -510,16 +510,10 @@ def buildDecisionTree(
# add else condition in the decision tree

if df.Decision.dtypes == "object": # classification
pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()

if pd.__version__.split(".")[0] == "1":
pivot = pivot.rename(columns={"Decision": "Instances", "index": "Decision"})
else: # if pd.__version__.split(".")[0] == "2":
pivot = pivot.rename(columns={"Decision": "Instances", "count": "Decision"})

pivot = pivot.sort_values(by=["Instances"], ascending=False).reset_index()

else_decision = f"return '{pivot.iloc[0].Decision}'"
pivot = pd.DataFrame(subdataset.Decision.value_counts()).sort_values(
by=["count"], ascending=False
)
else_decision = f"return '{str(pivot.iloc[0].name)}'"

if enableParallelism != True:
functions.storeRule(file, (functions.formatRule(root), "else:"))
@@ -669,7 +663,7 @@ def buildDecisionTree(
# this is reguler decision tree. find accuracy here.

module_name = "outputs/rules/rules"
myrules = load_module(module_name) # rules0
myrules = load_module(module_name) # rules0
models.append(myrules)

return models
@@ -682,7 +676,7 @@ def findPrediction(row):
params.append(row[j])

module_name = "outputs/rules/rules"
myrules = load_module(module_name) # rules0
myrules = load_module(module_name) # rules0

prediction = myrules.findDecision(params)
return prediction
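
The else-branch of a classification tree now takes the majority class straight from the pandas 2 `value_counts()` layout (labels in the index, counts in a column named `count`) rather than branching on the pandas major version. A small sketch of that shape, assuming pandas >= 2.0:

```python
# Sketch of the value_counts() layout the new else-branch relies on.
# Assumes pandas >= 2.0, where the counts column is named "count".
import pandas as pd

decisions = pd.Series(["Yes", "Yes", "No", "Yes"], name="Decision")

pivot = pd.DataFrame(decisions.value_counts()).sort_values(by=["count"], ascending=False)
# pivot now has the class labels in the index and a single "count" column

# the majority class is the index label of the first (largest) row
else_decision = f"return '{str(pivot.iloc[0].name)}'"
print(else_decision)  # return 'Yes'
```
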
11 changes: 6 additions & 5 deletions chefboost/tuning/adaboost.py
@@ -31,7 +31,9 @@ def findPrediction(row):
return prediction


def apply(df, config, header, dataset_features, validation_df=None, process_id=None):
def apply(
df, config, header, dataset_features, validation_df=None, process_id=None, silent: bool = False
):
models = []
alphas = []

@@ -53,8 +55,7 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N
best_epoch_idx = 0
best_epoch_value = 1000000

# for i in range(0, num_of_weak_classifier):
pbar = tqdm(range(0, num_of_weak_classifier), desc="Adaboosting")
pbar = tqdm(range(0, num_of_weak_classifier), desc="Adaboosting", disable=silent)
for i in pbar:
worksheet["Decision"] = worksheet["Weight"] * worksheet["Decision"]

@@ -139,8 +140,8 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N
pbar.set_description(f"Epoch {i + 1}. Loss: {mae}. Process: ")

# ------------------------------

logger.info(f"The best epoch is {best_epoch_idx} with the {best_epoch_value} MAE score")
if silent is False:
logger.info(f"The best epoch is {best_epoch_idx} with the {best_epoch_value} MAE score")

models = models[0 : best_epoch_idx + 1]
alphas = alphas[0 : best_epoch_idx + 1]
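
The boosting progress bar is now tied to the same flag through tqdm's `disable` argument, so `silent=True` hides it without touching the loop body. A minimal sketch of the pattern:

```python
# Minimal sketch of the tqdm suppression pattern used in the boosting loops.
# When disable=True the bar (and its set_description updates) print nothing.
from tqdm import tqdm

def boost(num_rounds: int = 3, silent: bool = False) -> None:
    pbar = tqdm(range(num_rounds), desc="Adaboosting", disable=silent)
    for i in pbar:
        loss = 1.0 / (i + 1)  # stand-in for one weak-classifier round
        pbar.set_description(f"Epoch {i + 1}. Loss: {loss}")

boost(silent=True)   # no progress output
boost(silent=False)  # visible bar with per-epoch descriptions
```
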
53 changes: 36 additions & 17 deletions chefboost/tuning/gbm.py
@@ -1,4 +1,5 @@
import gc
from typing import Optional, Union

import pandas as pd
import numpy as np
@@ -14,7 +15,7 @@
logger = Logger(module="chefboost/tuning/gbm.py")


def findPrediction(row):
def findPrediction(row: pd.Series) -> Union[str, float]:
epoch = row["Epoch"]
row = row.drop(labels=["Epoch"])
columns = row.shape[0]
@@ -32,7 +33,15 @@ def findPrediction(row):
return prediction


def regressor(df, config, header, dataset_features, validation_df=None, process_id=None):
def regressor(
df: pd.DataFrame,
config: dict,
header: str,
dataset_features: dict,
validation_df: Optional[pd.DataFrame] = None,
process_id: Optional[int] = None,
silent: bool = False,
) -> list:
models = []

# we will update decisions in every epoch, this will be used to restore
@@ -69,10 +78,7 @@ def regressor(df, config, header, dataset_features, validation_df=None, process_
best_epoch_idx = 0
best_epoch_loss = 1000000

pbar = tqdm(range(1, epochs + 1), desc="Boosting")

# for index in range(1,epochs+1):
# for index in tqdm(range(1,epochs+1), desc='Boosting'):
pbar = tqdm(range(1, epochs + 1), desc="Boosting", disable=silent)
for index in pbar:
logger.debug(f"epoch {index} - ")
loss = 0
@@ -155,22 +161,33 @@ def regressor(df, config, header, dataset_features, validation_df=None, process_

# ---------------------------------

logger.info(f"The best epoch is {best_epoch_idx} with {best_epoch_loss} loss value")
if silent is False:
logger.info(f"The best epoch is {best_epoch_idx} with {best_epoch_loss} loss value")
models = models[0:best_epoch_idx]
config["epochs"] = best_epoch_idx

logger.info(
f"MSE of {num_of_instances} instances are boosted from {boosted_from}"
f"to {best_epoch_loss} in {epochs} epochs"
)
if silent is False:
logger.info(
f"MSE of {num_of_instances} instances are boosted from {boosted_from}"
f"to {best_epoch_loss} in {epochs} epochs"
)

return models


def classifier(df, config, header, dataset_features, validation_df=None, process_id=None):
def classifier(
df: pd.DataFrame,
config: dict,
header: str,
dataset_features: dict,
validation_df: Optional[pd.DataFrame] = None,
process_id: Optional[int] = None,
silent: bool = False,
) -> tuple:
models = []

logger.info("gradient boosting for classification")
if silent is False:
logger.info("gradient boosting for classification")

epochs = config["epochs"]
enableParallelism = config["enableParallelism"]
@@ -182,7 +199,7 @@ def classifier(df, config, header, dataset_features, validation_df=None, process

boosted_predictions = np.zeros([df.shape[0], len(classes)])

pbar = tqdm(range(0, epochs), desc="Boosting")
pbar = tqdm(range(0, epochs), desc="Boosting", disable=silent)

# store actual set, we will use this to calculate loss
actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]), columns=classes)
@@ -317,9 +334,11 @@ def classifier(df, config, header, dataset_features, validation_df=None, process

# --------------------------------

logger.info(
f"The best accuracy got in {best_accuracy_idx} epoch with the score {best_accuracy_value}"
)
if silent is False:
logger.info(
f"The best accuracy got in {best_accuracy_idx} epoch"
f" with the score {best_accuracy_value}"
)

models = models[0 : best_accuracy_idx * len(classes) + len(classes)]

33 changes: 28 additions & 5 deletions chefboost/tuning/randomforest.py
@@ -1,7 +1,9 @@
from typing import Optional
import multiprocessing
from contextlib import closing

from tqdm import tqdm
import pandas as pd

from chefboost.commons import functions
from chefboost.training import Training
@@ -10,7 +12,15 @@
# pylint: disable=unused-argument


def apply(df, config, header, dataset_features, validation_df=None, process_id=None):
def apply(
df: pd.DataFrame,
config: dict,
header: str,
dataset_features: dict,
validation_df: Optional[pd.DataFrame] = None,
process_id: Optional[int] = None,
silent: bool = False,
):
models = []

num_of_trees = config["num_of_trees"]
@@ -24,9 +34,10 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N

input_params = []

pbar = tqdm(range(0, num_of_trees), desc="Bagging")
pbar = tqdm(range(0, num_of_trees), desc="Bagging", disable=silent)
for i in pbar:
pbar.set_description(f"Sub decision tree {i + 1} is processing")
if silent is False:
pbar.set_description(f"Sub decision tree {i + 1} is processing")
subset = df.sample(frac=1 / num_of_trees)

root = 1
@@ -38,7 +49,19 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N

if parallelism_on: # parallel run
input_params.append(
(subset, root, file, config, dataset_features, 0, 0, "root", i, None, process_id)
(
subset,
root,
file,
config,
dataset_features,
0,
0,
"root",
i,
None,
process_id,
)
)

else: # serial run
@@ -75,7 +98,7 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N

# all functions registered here
# results = []
for f in tqdm(funclist):
for f in tqdm(funclist, disable=silent):
_ = f.get(timeout=100000) # this was branch_results
# results.append(branch_results)

8 binary files not shown.
339 changes: 0 additions & 339 deletions tests/global-unit-test.py

This file was deleted.

27 changes: 27 additions & 0 deletions tests/test_adaboost.py
@@ -0,0 +1,27 @@
import pandas as pd
from chefboost import Chefboost as cb
from chefboost.commons.logger import Logger

logger = Logger(module="tests/test_adaboost.py")


def test_adaboost():
config = {
"algorithm": "Regression",
"enableAdaboost": True,
"num_of_weak_classifier": 10,
"enableParallelism": False,
}
df = pd.read_csv("dataset/adaboost.txt")
validation_df = df.copy()

model = cb.fit(df, config, validation_df=validation_df, silent=True)

instance = [4, 3.5]

prediction = cb.predict(model, instance)

assert prediction == -1
assert len(model["trees"]) > 1

logger.info("✅ adaboost model restoration test done")
24 changes: 24 additions & 0 deletions tests/test_c45.py
@@ -0,0 +1,24 @@
import pandas as pd
from chefboost import Chefboost as cb
from chefboost.commons.logger import Logger

logger = Logger(module="tests/test_c45.py")


def test_c45_for_nominal_features_and_nominal_target():
df = pd.read_csv("dataset/golf.txt")
model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True)
assert model["config"]["algorithm"] == "C4.5"
logger.info("✅ build c4.5 for nominal and numeric features and nominal target test done")

def test_c45_for_nominal_and_numeric_features_and_nominal_target():
df = pd.read_csv("dataset/golf2.txt")
model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True)
assert model["config"]["algorithm"] == "C4.5"
logger.info("✅ build c4.5 for nominal and numeric features and nominal target test done")

def test_large_dataset():
df = pd.read_csv("dataset/car.data")
model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True)
assert model["config"]["algorithm"] == "C4.5"
logger.info("✅ build c4.5 for large dataset test done")
25 changes: 25 additions & 0 deletions tests/test_cart.py
@@ -0,0 +1,25 @@
import pandas as pd
from chefboost import Chefboost as cb
from chefboost.commons.logger import Logger

logger = Logger(module="tests/test_cart.py")


def test_cart_for_nominal_features_and_nominal_target():
df = pd.read_csv("dataset/golf.txt")
model = cb.fit(df, config={"algorithm": "CART"}, silent=True)
assert model["config"]["algorithm"] == "CART"
logger.info("✅ build cart for nominal and numeric features and nominal target test done")


def test_cart_for_nominal_and_numeric_features_and_nominal_target():
df = pd.read_csv("dataset/golf2.txt")
model = cb.fit(df, config={"algorithm": "CART"}, silent=True)
assert model["config"]["algorithm"] == "CART"
logger.info("✅ build cart for nominal and numeric features and nominal target test done")

def test_large_dataset():
df = pd.read_csv("dataset/car.data")
model = cb.fit(df, config={"algorithm": "CART"}, silent=True)
assert model["config"]["algorithm"] == "CART"
logger.info("✅ build c4.5 for large dataset test done")
26 changes: 26 additions & 0 deletions tests/test_chaid.py
@@ -0,0 +1,26 @@
import pandas as pd
from chefboost import Chefboost as cb
from chefboost.commons.logger import Logger

logger = Logger(module="tests/test_c45.py")


def test_chaid_for_nominal_features_and_nominal_target():
df = pd.read_csv("dataset/golf.txt")
model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True)
assert model["config"]["algorithm"] == "CHAID"
logger.info("✅ build chaid for nominal features and nominal target test done")


def test_chaid_for_nominal_and_numeric_features_and_nominal_target():
df = pd.read_csv("dataset/golf2.txt")
model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True)
assert model["config"]["algorithm"] == "CHAID"
logger.info("✅ build chaid for nominal and numeric features and nominal target test done")


def test_large_dataset():
df = pd.read_csv("dataset/car.data")
model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True)
assert model["config"]["algorithm"] == "CHAID"
logger.info("✅ build c4.5 for large dataset test done")
48 changes: 48 additions & 0 deletions tests/test_gbm.py
@@ -0,0 +1,48 @@
import pandas as pd
from chefboost import Chefboost as cb
from chefboost.commons.logger import Logger

logger = Logger(module="tests/test_gbm.py")


def test_gbm_regression():
config = {
"algorithm": "Regression",
"enableGBM": True,
"epochs": 10,
"learning_rate": 1,
}

df = pd.read_csv("dataset/golf4.txt")
validation_df = pd.read_csv("dataset/golf4.txt")

model = cb.fit(df, config, validation_df=validation_df, silent=True)
assert model["config"]["algorithm"] == "Regression"
assert len(model["trees"]) > 1

features = ["Sunny", 85, 85, "Weak"]
target = 25
prediction = cb.predict(model, features)
assert abs(prediction - target) < 1


def test_gbm_classification():
config = {
"algorithm": "ID3",
"enableGBM": True,
"epochs": 10,
"learning_rate": 1,
}

df = pd.read_csv(
"dataset/iris.data",
names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"],
)
validation_df = df.copy()

model = cb.fit(df, config, validation_df=validation_df, silent=True)

instance = [7.0, 3.2, 4.7, 1.4]
target = "Iris-versicolor"
prediction = cb.predict(model, instance)
assert prediction == target
114 changes: 114 additions & 0 deletions tests/test_id3.py
@@ -0,0 +1,114 @@
import pandas as pd
from chefboost import Chefboost as cb
from chefboost.commons.logger import Logger

logger = Logger(module="tests/test_id3.py")


def test_build_id3_with_no_config():
df = pd.read_csv("dataset/golf.txt")
model = cb.fit(df, silent=True)
assert model["config"]["algorithm"] == "ID3"
logger.info("✅ standard id3 test done")


def test_build_id3_with_internal_validation_df():
df = pd.read_csv("dataset/golf.txt")
validation_df = pd.read_csv("dataset/golf.txt")

model = cb.fit(df, validation_df=validation_df, silent=True)

assert model["config"]["algorithm"] == "ID3"

validation_eval_results = model["evaluation"]["validation"]

assert validation_eval_results.get("Accuracy", 0) > 99
assert validation_eval_results.get("Precision", 0) > 99
assert validation_eval_results.get("Recall", 0) > 99
assert validation_eval_results.get("F1", 0) > 99
assert validation_eval_results.get("Instances", 0) == validation_df.shape[0]
assert "Confusion matrix" in validation_eval_results.keys()
assert "Labels" in validation_eval_results.keys()

# decision_rules = model["trees"][0].__dict__["__name__"]+".py"
decision_rules = model["trees"][0].__dict__["__spec__"].origin

fi_df = cb.feature_importance(decision_rules, silent=True)
assert fi_df.shape[0] == 4

logger.info("✅ id3 test with internal validation data frame done")


def test_build_id3_with_external_validation_set():
df = pd.read_csv("dataset/golf.txt")
model = cb.fit(df, silent=True)

assert model["config"]["algorithm"] == "ID3"

validation_df = pd.read_csv("dataset/golf.txt")
results = cb.evaluate(model, validation_df, silent=True)

assert results.get("Accuracy", 0) > 99
assert results.get("Precision", 0) > 99
assert results.get("Recall", 0) > 99
assert results.get("F1", 0) > 99
assert results.get("Instances", 0) == validation_df.shape[0]
assert "Confusion matrix" in results.keys()
assert "Labels" in results.keys()

logger.info("✅ id3 test with external validation data frame done")


def test_model_restoration():
df = pd.read_csv("dataset/golf.txt")
model = cb.fit(df, silent=True)
assert model["config"]["algorithm"] == "ID3"

cb.save_model(model)

restored_model = cb.load_model("model.pkl")

assert restored_model["config"]["algorithm"] == "ID3"

instance = ["Sunny", "Hot", "High", "Weak"]

prediction = cb.predict(restored_model, instance)
assert prediction == "No"

logger.info("✅ id3 model restoration test done")


def test_build_id3_for_nominal_and_numeric_features_nominal_target():
df = pd.read_csv("dataset/golf2.txt")
model = cb.fit(df, silent=True)

assert model["config"]["algorithm"] == "ID3"

instance = ["Sunny", 85, 85, "Weak"]
prediction = cb.predict(model, instance)
assert prediction == "No"
logger.info("✅ build id3 for nominal and numeric features and nominal target test done")


def test_large_data_set():
df = pd.read_csv("dataset/car.data")
model = cb.fit(df, silent=True)

assert model["config"]["algorithm"] == "ID3"

instance = ["vhigh", "vhigh", 2, "2", "small", "low"]
prediction = cb.predict(model, instance)
assert prediction == "unacc"

instance = ["high", "high", "4", "more", "big", "high"]
prediction = cb.predict(model, instance)
assert prediction == "acc"


def test_iris_dataset():
df = pd.read_csv(
"dataset/iris.data",
names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"],
)
model = cb.fit(df, silent=True)
assert model["config"]["algorithm"] == "ID3"
55 changes: 55 additions & 0 deletions tests/test_randomforest.py
@@ -0,0 +1,55 @@
import pandas as pd
from chefboost import Chefboost as cb
from chefboost.commons.logger import Logger

logger = Logger(module="tests/test_randomforest.py")


def test_randomforest_for_classification():
config = {
"algorithm": "ID3",
"enableRandomForest": True,
"num_of_trees": 3,
}
df = pd.read_csv("dataset/car.data")

model = cb.fit(df, config, silent=True)

assert model["config"]["algorithm"] == "ID3"
assert model["evaluation"]["train"]["Accuracy"] > 90

# feature importance
decision_rules = []
for tree in model["trees"]:
decision_rule = tree.__dict__["__spec__"].origin
decision_rules.append(decision_rule)

df = cb.feature_importance(decision_rules, silent=True)
assert df.shape[0] == 6

# this is not in train data
instance = ["high", "high", 4, "more", "big", "high"]
prediction = cb.predict(model, instance)
assert prediction in ["unacc", "acc"]

instance = ["vhigh", "vhigh", 2, "2", "small", "low"]
prediction = cb.predict(model, instance)
assert prediction in ["unacc", "acc"]


def test_randomforest_for_regression():
config = {
"algorithm": "ID3",
"enableRandomForest": True,
"num_of_trees": 5,
}
df = pd.read_csv("dataset/car_reg.data")
model = cb.fit(df, config, silent=True)

assert model["evaluation"]["train"]["MAE"] < 10
assert model["config"]["algorithm"] == "Regression"

instance = ["high", "high", 4, "more", "big", "high"]
target = 100
prediction = cb.predict(model, instance)
assert abs(prediction - target) < 30
27 changes: 27 additions & 0 deletions tests/test_regression.py
@@ -0,0 +1,27 @@
import pandas as pd
from chefboost import Chefboost as cb
from chefboost.commons.logger import Logger

logger = Logger(module="tests/test_regression.py")


def test_regression_for_nominal_features_and_numeric_target():
df = pd.read_csv("dataset/golf3.txt")
_ = cb.fit(df, config={"algorithm": "Regression"}, silent=True)
logger.info("✅ build regression for nominal features and numeric target test done")


def test_regression_for_nominal_and_numeric_features_and_numeric_target():
df = pd.read_csv("dataset/golf4.txt")
_ = cb.fit(df, config={"algorithm": "Regression"}, silent=True)
logger.info(
"✅ build regression tree for nominal and numeric features and numeric target test done"
)


def test_switching_to_regression_tree():
df = pd.read_csv("dataset/golf4.txt")
config = {"algorithm": "ID3"}
model = cb.fit(df, config, silent=True)
assert model["config"]["algorithm"] == "Regression"
logger.info("✅ switching to regression tree test done")
