Commit 02991d5

Merge branch 'refactor_development' into refactor_basetask
2 parents: 9481437 + 5fef094

File tree

8 files changed (+175, -35 lines)


autoPyTorch/evaluation/abstract_evaluator.py

+24 -10

@@ -84,8 +84,8 @@ def __init__(self, config: str,
         configuration_space = self.pipeline.get_hyperparameter_search_space()
         default_configuration = configuration_space.get_default_configuration().get_dictionary()
         default_configuration['model_trainer:tabular_classifier:classifier'] = config
-        configuration = Configuration(configuration_space, default_configuration)
-        self.pipeline.set_hyperparameters(configuration)
+        self.configuration = Configuration(configuration_space, default_configuration)
+        self.pipeline.set_hyperparameters(self.configuration)

     def fit(self, X: Dict[str, Any], y: Any,
             sample_weight: Optional[np.ndarray] = None) -> object:
@@ -102,8 +102,18 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
     def estimator_supports_iterative_fit(self) -> bool:  # pylint: disable=R0201
         return False

-    def get_additional_run_info(self) -> None:  # pylint: disable=R0201
-        return None
+    def get_additional_run_info(self) -> Dict[str, Any]:  # pylint: disable=R0201
+        """
+        Can be used to return additional info for the run.
+        Returns:
+            Dict[str, Any]:
+            Currently contains
+                1. pipeline_configuration: the configuration of the pipeline, i.e, the traditional model used
+                2. trainer_configuration: the parameters for the traditional model used.
+                   Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs
+        """
+        return {'pipeline_configuration': self.configuration,
+                'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config()}

     def get_pipeline_representation(self) -> Dict[str, str]:
         return self.pipeline.get_pipeline_representation()
@@ -134,7 +144,9 @@ def __init__(self, config: Configuration,
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None
                  ) -> None:
-        self.configuration = config
+        self.config = config
+        self.init_params = init_params
+        self.random_state = random_state
         if config == 1:
             super(DummyClassificationPipeline, self).__init__(strategy="uniform")
         else:
@@ -163,8 +175,8 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
     def estimator_supports_iterative_fit(self) -> bool:  # pylint: disable=R0201
         return False

-    def get_additional_run_info(self) -> None:  # pylint: disable=R0201
-        return None
+    def get_additional_run_info(self) -> Dict:  # pylint: disable=R0201
+        return {}

     def get_pipeline_representation(self) -> Dict[str, str]:
         return {
@@ -198,7 +210,9 @@ class DummyRegressionPipeline(DummyRegressor):
     def __init__(self, config: Configuration,
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None) -> None:
-        self.configuration = config
+        self.config = config
+        self.init_params = init_params
+        self.random_state = random_state
         if config == 1:
             super(DummyRegressionPipeline, self).__init__(strategy='mean')
         else:
@@ -219,8 +233,8 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
     def estimator_supports_iterative_fit(self) -> bool:  # pylint: disable=R0201
         return False

-    def get_additional_run_info(self) -> None:  # pylint: disable=R0201
-        return None
+    def get_additional_run_info(self) -> Dict:  # pylint: disable=R0201
+        return {}

     @staticmethod
     def get_default_pipeline_options() -> Dict[str, Any]:
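
For orientation, a minimal sketch (not part of this commit) of how the dictionary now returned by get_additional_run_info() could be consumed, assuming `pipeline` is a fitted instance of the traditional-classifier wrapper pipeline modified above:

    # Hypothetical usage; attribute and key names follow the diff above.
    info = pipeline.get_additional_run_info()
    config = info['pipeline_configuration']         # the ConfigSpace Configuration stored in self.configuration
    trainer_params = info['trainer_configuration']  # parameters of the traditional model, via model.get_config()
    print(config, trainer_params)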

autoPyTorch/evaluation/tae.py

+1 -1

@@ -57,7 +57,7 @@ def fit_predict_try_except_decorator(
 def get_cost_of_crash(metric: autoPyTorchMetric) -> float:
     # The metric must always be defined to extract optimum/worst
     if not isinstance(metric, autoPyTorchMetric):
-        raise ValueError("The metric must be stricly be an instance of autoPyTorchMetric")
+        raise ValueError("The metric must be strictly be an instance of autoPyTorchMetric")

     # Autopytorch optimizes the err. This function translates
     # worst_possible_result to be a minimization problem.

autoPyTorch/evaluation/train_evaluator.py

+4 -1

@@ -143,6 +143,8 @@ def fit_predict_and_loss(self) -> None:
             # weights for opt_losses.
             opt_fold_weights = [np.NaN] * self.num_folds

+            additional_run_info = {}
+
             for i, (train_split, test_split) in enumerate(self.splits):

                 pipeline = self.pipelines[i]
@@ -178,7 +180,8 @@ def fit_predict_and_loss(self) -> None:
                 # number of optimization data points for this fold.
                 # Used for weighting the average.
                 opt_fold_weights[i] = len(train_split)
-
+                additional_run_info.update(pipeline.get_additional_run_info() if hasattr(
+                    pipeline, 'get_additional_run_info') and pipeline.get_additional_run_info() is not None else {})
             # Compute weights of each fold based on the number of samples in each
             # fold.
             train_fold_weights = [w / sum(train_fold_weights)
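
The guarded update above tolerates pipelines that either lack the hook or return None. A standalone sketch of the same pattern (illustrative only; `fitted_pipelines` is a made-up name, not from the repository):

    # Collect optional per-fold run info; objects may lack the hook or return None.
    additional_run_info = {}
    for obj in fitted_pipelines:
        info = obj.get_additional_run_info() if hasattr(obj, 'get_additional_run_info') else None
        additional_run_info.update(info or {})

Caching the hook's result in a local variable, as in this sketch, also avoids calling it twice per fold, which the one-line form in the diff does.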

autoPyTorch/utils/backend.py

+45

@@ -169,6 +169,10 @@ def __init__(self, context: BackendContext):
         self._logger = None  # type: Optional[PicklableClientLogger]
         self.context = context

+        # Track the number of configurations launched
+        # num_run == 1 means a dummy estimator run
+        self.active_num_run = 1
+
         # Create the temporary directory if it does not yet exist
         try:
             os.makedirs(self.temporary_directory)
@@ -329,6 +333,47 @@ def get_runs_directory(self) -> str:
     def get_numrun_directory(self, seed: int, num_run: int, budget: float) -> str:
         return os.path.join(self.internals_directory, 'runs', '%d_%d_%s' % (seed, num_run, budget))

+    def get_next_num_run(self, peek: bool = False) -> int:
+        """
+        Every pipeline that is fitted by the estimator is stored with an
+        identifier called num_run. A dummy classifier will always have a num_run
+        equal to 1, and all other new configurations that are explored will
+        have a sequentially increasing identifier.
+
+        This method returns the next num_run a configuration should take.
+
+        Parameters
+        ----------
+        peek: bool
+            By default, the next num_rum will be returned, i.e. self.active_num_run + 1
+            Yet, if this bool parameter is equal to True, the value of the current
+            num_run is provided, i.e, self.active_num_run.
+            In other words, peek allows to get the current maximum identifier
+            of a configuration.
+
+        Returns
+        -------
+        num_run: int
+            An unique identifier for a configuration
+        """
+
+        # If there are other num_runs, their name would be runs/<seed>_<num_run>_<budget>
+        other_num_runs = [int(os.path.basename(run_dir).split('_')[1])
+                          for run_dir in glob.glob(os.path.join(self.internals_directory,
+                                                                'runs',
+                                                                '*'))]
+        if len(other_num_runs) > 0:
+            # We track the number of runs from two forefronts:
+            # The physically available num_runs (which might be deleted or a crash could happen)
+            # From a internally kept attribute. The later should be sufficient, but we
+            # want to be robust against multiple backend copies on different workers
+            self.active_num_run = max([self.active_num_run] + other_num_runs)
+
+        # We are interested in the next run id
+        if not peek:
+            self.active_num_run += 1
+        return self.active_num_run
+
     def get_model_filename(self, seed: int, idx: int, budget: float) -> str:
         return '%s.%s.%s.model' % (seed, idx, budget)
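
A short sketch of the intended call pattern for the new method, assuming `backend` is a freshly constructed Backend with no runs on disk (the new test in test/test_utils/test_backend.py below exercises the same behaviour):

    # num_run 1 is reserved for the dummy estimator, so the first real
    # configuration gets 2; peek=True reads the counter without advancing it.
    num_run = backend.get_next_num_run()        # returns 2 and advances the counter
    same = backend.get_next_num_run(peek=True)  # still 2, counter unchanged
    assert num_run == same == 2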

test/test_api/test_api.py

+74 -19

@@ -12,8 +12,11 @@

 import sklearn
 import sklearn.datasets
+from sklearn.base import clone
 from sklearn.ensemble import VotingClassifier, VotingRegressor

+from smac.runhistory.runhistory import RunHistory
+
 import torch

 from autoPyTorch.api.tabular_classification import TabularClassificationTask
@@ -23,6 +26,7 @@
     HoldoutValTypes,
 )
 from autoPyTorch.optimizer.smbo import AutoMLSMBO
+from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy


 # Fixtures
@@ -104,17 +108,20 @@ def test_tabular_classification(openml_id, resampling_strategy, backend):

     # Search for an existing run key in disc. A individual model might have
     # a timeout and hence was not written to disc
+    successful_num_run = None
+    SUCCESS = False
     for i, (run_key, value) in enumerate(estimator.run_history.data.items()):
-        if 'SUCCESS' not in str(value.status):
-            continue
-
-        run_key_model_run_dir = estimator._backend.get_numrun_directory(
-            estimator.seed, run_key.config_id + 1, run_key.budget)
-        if os.path.exists(run_key_model_run_dir):
-            # Runkey config id is different from the num_run
-            # more specifically num_run = config_id + 1(dummy)
+        if 'SUCCESS' in str(value.status):
+            run_key_model_run_dir = estimator._backend.get_numrun_directory(
+                estimator.seed, run_key.config_id + 1, run_key.budget)
             successful_num_run = run_key.config_id + 1
-            break
+            if os.path.exists(run_key_model_run_dir):
+                # Runkey config id is different from the num_run
+                # more specifically num_run = config_id + 1(dummy)
+                SUCCESS = True
+                break
+
+    assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}"

     if resampling_strategy == HoldoutValTypes.holdout_validation:
         model_file = os.path.join(run_key_model_run_dir,
@@ -272,17 +279,20 @@ def test_tabular_regression(openml_name, resampling_strategy, backend):

     # Search for an existing run key in disc. A individual model might have
     # a timeout and hence was not written to disc
+    successful_num_run = None
+    SUCCESS = False
     for i, (run_key, value) in enumerate(estimator.run_history.data.items()):
-        if 'SUCCESS' not in str(value.status):
-            continue
-
-        run_key_model_run_dir = estimator._backend.get_numrun_directory(
-            estimator.seed, run_key.config_id + 1, run_key.budget)
-        if os.path.exists(run_key_model_run_dir):
-            # Runkey config id is different from the num_run
-            # more specifically num_run = config_id + 1(dummy)
+        if 'SUCCESS' in str(value.status):
+            run_key_model_run_dir = estimator._backend.get_numrun_directory(
+                estimator.seed, run_key.config_id + 1, run_key.budget)
             successful_num_run = run_key.config_id + 1
-            break
+            if os.path.exists(run_key_model_run_dir):
+                # Runkey config id is different from the num_run
+                # more specifically num_run = config_id + 1(dummy)
+                SUCCESS = True
+                break
+
+    assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}"

     if resampling_strategy == HoldoutValTypes.holdout_validation:
         model_file = os.path.join(run_key_model_run_dir,
@@ -384,7 +394,7 @@ def test_tabular_input_support(openml_id, backend):
     estimator._do_dummy_prediction = unittest.mock.MagicMock()

     with unittest.mock.patch.object(AutoMLSMBO, 'run_smbo') as AutoMLSMBOMock:
-        AutoMLSMBOMock.return_value = ({}, {}, 'epochs')
+        AutoMLSMBOMock.return_value = (RunHistory(), {}, 'epochs')
         estimator.search(
             X_train=X_train, y_train=y_train,
             X_test=X_test, y_test=y_test,
@@ -394,3 +404,48 @@ def test_tabular_input_support(openml_id, backend):
             enable_traditional_pipeline=False,
             load_models=False,
         )
+
+
+@pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True)
+def test_do_dummy_prediction(dask_client, fit_dictionary_tabular):
+    backend = fit_dictionary_tabular['backend']
+    estimator = TabularClassificationTask(
+        backend=backend,
+        resampling_strategy=HoldoutValTypes.holdout_validation,
+        ensemble_size=0,
+    )
+
+    # Setup pre-requisites normally set by search()
+    estimator._create_dask_client()
+    estimator._metric = accuracy
+    estimator._logger = estimator._get_logger('test')
+    estimator._memory_limit = 5000
+    estimator._time_for_task = 60
+    estimator._disable_file_output = []
+    estimator._all_supported_metrics = False
+
+    estimator._do_dummy_prediction()
+
+    # Ensure that the dummy predictions are not in the current working
+    # directory, but in the temporary directory.
+    assert not os.path.exists(os.path.join(os.getcwd(), '.autoPyTorch'))
+    assert os.path.exists(os.path.join(
+        backend.temporary_directory, '.autoPyTorch', 'runs', '1_1_1.0',
+        'predictions_ensemble_1_1_1.0.npy')
+    )
+
+    model_path = os.path.join(backend.temporary_directory,
+                              '.autoPyTorch',
+                              'runs', '1_1_1.0',
+                              '1.1.1.0.model')
+
+    # Make sure the dummy model complies with scikit learn
+    # get/set params
+    assert os.path.exists(model_path)
+    with open(model_path, 'rb') as model_handler:
+        clone(pickle.load(model_handler))
+
+    estimator._close_dask_client()
+    estimator._clean_logger()
+
+    del estimator

test/test_evaluation/test_train_evaluator.py

+2 -2

@@ -50,8 +50,8 @@ def __init__(self):
     def predict_proba(self, X, batch_size=None):
         return np.tile([0.6, 0.4], (len(X), 1))

-    def get_additional_run_info(self) -> None:
-        return None
+    def get_additional_run_info(self):
+        return {}


 class TestTrainEvaluator(BaseEvaluatorTest, unittest.TestCase):

test/test_pipeline/test_tabular_classification.py

+2 -2

@@ -439,5 +439,5 @@ def test_constant_pipeline_iris(fit_dictionary_tabular):
     val_score = run_summary.performance_tracker['val_metrics'][epoch_where_best]['balanced_accuracy']
     train_score = run_summary.performance_tracker['train_metrics'][epoch_where_best]['balanced_accuracy']

-    assert val_score >= 0.9, run_summary.performance_tracker['val_metrics']
-    assert train_score >= 0.9, run_summary.performance_tracker['train_metrics']
+    assert val_score >= 0.8, run_summary.performance_tracker['val_metrics']
+    assert train_score >= 0.8, run_summary.performance_tracker['train_metrics']

test/test_utils/test_backend.py

+23

@@ -1,8 +1,11 @@
 # -*- encoding: utf-8 -*-
 import builtins
+import logging.handlers
 import unittest
 import unittest.mock

+import numpy as np
+
 import pytest

 from autoPyTorch.utils.backend import Backend
@@ -81,3 +84,23 @@ def test_loads_models_by_identifiers(exists_mock, openMock, pickleLoadMock, back

     assert isinstance(actual_dict, dict)
     assert expected_dict == actual_dict
+
+
+def test_get_next_num_run(backend):
+    # Asking for a num_run increases the tracked num_run
+    assert backend.get_next_num_run() == 2
+    assert backend.get_next_num_run() == 3
+    # Then test that we are robust against new files being generated
+    backend.setup_logger('Test', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
+    backend.save_numrun_to_dir(
+        seed=0,
+        idx=12,
+        budget=0.0,
+        model=dict(),
+        cv_model=None,
+        ensemble_predictions=np.zeros(10),
+        valid_predictions=None,
+        test_predictions=None,
+    )
+    assert backend.get_next_num_run() == 13
+    assert backend.get_next_num_run(peek=True) == 13
