
# -*- coding: utf-8 -*-
"""
Created on Fri Apr  1 15:32:17 2016

@author: Jonas Eschle "Mayou36"

| This module provides the meta-configuration.
| Mostly, you do not need to change this file, or only small parts of it.
| Things you may want to change include:
    - whether to prompt for an addition to the file name
    - the number of cores to use
    - the path for pickle-files
    - the default configuration for classifiers, figure saving and more
| Furthermore, it contains:
 - (package-)global default variables for all modules
 - debug options which change some implementation on a basic level, like protocols
 - global configurations, like the endings of specific file-types etc.

The explanation of each variable is available as a comment behind it.

Variables:
---------
run_config:
    It provides the right config module depending on what was chosen
    in the run-methods.
    Should not be changed during the run, only once at the beginning.
SUPPRESS_WRONG_SKLEARN_VERSION:
    This package was built for sklearn 0.17. With 0.18 there are some
    module-name changes, which can cause a crash of the program.
"""

# ==============================================================================
# DO NOT IMPORT ANY PACKAGE (run configuration) DEPENDENCY!
# ==============================================================================
from __future__ import division, absolute_import

try:  # cPickle is the accelerated pickle of Python 2; fall back to plain pickle
    import cPickle as pickle
except ImportError:
    import pickle
import multiprocessing
import random


# ==============================================================================
# Parameters which can be changed WITHOUT affecting stability of a single run.
# Be aware: certain tasks, like loading a pickled file, may fail if the
# file endings are changed.
# ==============================================================================

# ------------------------------------------------------------------------------
# General run parameters
# ------------------------------------------------------------------------------

PROMPT_FOR_COMMENT = False  # lets you add an extension to the run/file name
MULTITHREAD = True  # if False, no parallel work will be done
MULTIPROCESSING = True  # requires MULTITHREAD to be True, otherwise it is effectively False
n_cpu_max = 1  # a rough guideline, not a strict limit. If None, the number of cores will be assigned
use_gpu = False  # if True, optimisation for GPU use is done (e.g. nn not parallel on cpu).
# This does NOT use the GPU yet, but avoids the cpu where the GPU will be invoked
use_stratified_folding = True  # StratifiedKFolding is better from a statistical point of view,
# but also needs more memory; mostly insignificantly, but it can be large


def get_n_cpu(n_cpu=None):
    """Return the number of cpus to use. None means all. Can be -1, -2..."""
    if n_cpu is None:
        n_cpu = -1  # None means "all cores"; resolved by the negative-count rule below
    if isinstance(n_cpu, int):
        if n_cpu < 0:
            n_cpu = max([n_cpu_max + n_cpu + 1, 1])
            # n_cpu = min([n_cpu, n_cpu_max])
    return n_cpu
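
# Illustration of the negative-count convention, mirroring the scikit-learn
# style "-1 means all cores" (values assume n_cpu_max has been set to 8):
#
#     get_n_cpu(3)    # -> 3, positive values pass through unchanged
#     get_n_cpu(-1)   # -> 8, all cores: max([8 + (-1) + 1, 1])
#     get_n_cpu(-2)   # -> 7, all but one core
#     get_n_cpu()     # -> 8, None resolves to "all"
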
# set meta-config variables
def set_parallel_profile(n_cpu=-1, gpu_in_use=False, stratified_kfolding=True):
    """Set the number of cpus and whether a gpu is in use or not."""
    global MULTIPROCESSING, MULTITHREAD, n_cpu_max, use_gpu, use_stratified_folding
    use_stratified_folding = stratified_kfolding
    MULTIPROCESSING = MULTITHREAD = True
    if n_cpu == 1:
        n_cpu_max = 1
    elif n_cpu is None:
        pass
    elif isinstance(n_cpu, int):
        if n_cpu > 1:
            n_cpu_max = n_cpu
        elif n_cpu < 0:
            n_cpu_max = max([multiprocessing.cpu_count() + n_cpu + 1, 1])  # -1 is "all cpus"
        else:
            raise ValueError("Invalid n_cpu argument: " + str(n_cpu))
    else:
        raise TypeError("Wrong n_cpu argument, type: " + str(type(n_cpu)) + " not allowed")
    use_gpu = gpu_in_use if gpu_in_use is not None else use_gpu
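
# Example (hypothetical values): configure the module for a machine where all
# but one core should be used, with stratified folding and no GPU:
#
#     set_parallel_profile(n_cpu=-2, gpu_in_use=False, stratified_kfolding=True)
#     get_n_cpu(-1)  # now resolves against the updated n_cpu_max
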
# ------------------------------------------------------------------------------
# Datatype ending variables
# ------------------------------------------------------------------------------
# The ending of a certain variable type. Change with caution and good reason.
PICKLE_DATATYPE = "pickle"  # default: 'pickle'
ROOT_DATATYPE = "root"  # default: 'root'

# ------------------------------------------------------------------------------
# SHARED OBJECT PATHS, INPUT & OUTPUT
# ------------------------------------------------------------------------------
# folder where the pickled objects are stored
PICKLE_PATH = '/home/mayou/Documents/uniphysik/Bachelor_thesis/analysis/pickle/'
# folder where the git-directory is located. Can be an empty string
GIT_DIR_PATH = "/home/mayou/Documents/uniphysik/Bachelor_thesis/" + \
               "python_workspace/raredecay/raredecay"

# ------------------------------------------------------------------------------
# Debug related options
# ------------------------------------------------------------------------------
# These options should not directly affect the behaviour (except speed etc.)
# IF the right environment is used. Don't touch unless you have good reason to.
PICKLE_PROTOCOL = pickle.HIGHEST_PROTOCOL  # default: pickle.HIGHEST_PROTOCOL
SUPPRESS_WRONG_SKLEARN_VERSION = False  # Should NOT BE CHANGED.

# ==============================================================================
# Parameters which may affect stability
# (setting, for example, MAX_AUTO_FOLDERS to 0 will surely not work)
# ==============================================================================

# ------------------------------------------------------------------------------
# Limits for auto-methods
# ------------------------------------------------------------------------------
# If a folder already exists and no overwrite is in use, a new folder (with a
# trailing number) will be created. A limit can be set to prevent filling the
# disk in case of an endless-loop error or similar.
MAX_AUTO_FOLDERS = 10000  # max number of auto-generated folders by initialize
NO_PROMPT_ASSUME_YES = True  # no user input required, assumes yes (e.g. when overwriting files)
MAX_ERROR_COUNT = 1000  # maximum number of tolerated errors (not able to save figure etc.)
# Criticals will end the run anyway.
MAX_FIGURES = 1000  # max number of figures to be plotted

# ==============================================================================
# DEFAULT SETTINGS for different things
# ==============================================================================
# ------------------------------------------------------------------------------
# Output and plot configurations
# ------------------------------------------------------------------------------
# Available output folders. Do NOT CHANGE THE KEYS as modules depend on them!
# You may add additional key-value pairs or just change some values.
# The names of the folders created inside the run-folder.
DEFAULT_OUTPUT_FOLDERS = dict(
    log="log",  # contains the logger information
    plots="plots",  # contains all the plots
    results="results",  # contains the written output
    config="config"  # NOT YET IMPLEMENTED, but could contain the config file used
)

# The default histogram settings used for some plots
DEFAULT_HIST_SETTINGS = dict(
    bins=40,  # default: 40
    normed=True,  # default: True, useful for shape comparison of distributions
    alpha=0.5,  # transparency [0.0, 1.0]
    histtype='stepfilled'
)

# Default configuration for most of the figures for save_fig from OutputHandler()
DEFAULT_SAVE_FIG = dict(
    file_format=['png', 'pdf'],  # default: ['png', 'svg'], the file formats to save to
    dpi=150,  # for implementation details, see OutputHandler()
    to_pickle=True,  # whether to pickle the plot (and therefore be able to replot)
    # save_cfg=None
)

# Default configuration for additional figures (plots you mostly do not care
# about but may be happy to have saved somewhere)
DEFAULT_EXT_SAVE_FIG = dict(
    file_format=['png', 'pdf'],
    to_pickle=True
    # save_cfg=None
)

# A logger writes some information during the run, just for control of the
# correct execution. The log will be written to console, to file, or both.
# Each message has a level, ranging from the lowest (most unimportant) 'debug'
# to 'critical'. You can specify which level (plus the more important ones)
# will appear where.
# Example: you can set the console to 'error' and the file to 'info'. This way
# you also collect seemingly unnecessary information (which may later be nice
# to check whether a variable was meaningful), but on screen you will only see
# whether an error or critical occurs.
DEFAULT_LOGGER_CFG = dict(
    logging_mode='console',  # where the logger is written to:
    # 'both', 'file', 'console' or 'no'
    log_level_file='debug',  # 'debug', 'info', 'warning', 'error', 'critical'
    # specifies the level to be logged to the file
    log_level_console='debug',  # 'debug', 'info', 'warning', 'error', 'critical'
    # specifies the level to be logged to the console
    overwrite_file=True,  # whether to overwrite the log file each time
    # or instead make a new one each run
    log_file_name='AAlastRun',  # the beginning of the name of the logfile, like 'project1'
    log_file_dir=DEFAULT_OUTPUT_FOLDERS.get('log')
)
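
# A minimal sketch of how such a configuration maps onto the standard
# library's `logging` module (illustrative only; the package's own logger
# lives elsewhere, `make_logger` is a hypothetical helper, and handling of
# log_file_dir is omitted for brevity):
#
#     import logging
#
#     def make_logger(name, cfg=DEFAULT_LOGGER_CFG):
#         logger = logging.getLogger(name)
#         logger.setLevel(logging.DEBUG)  # each handler filters on its own below
#         if cfg['logging_mode'] in ('console', 'both'):
#             console = logging.StreamHandler()
#             console.setLevel(getattr(logging, cfg['log_level_console'].upper()))
#             logger.addHandler(console)
#         if cfg['logging_mode'] in ('file', 'both'):
#             mode = 'w' if cfg['overwrite_file'] else 'a'
#             to_file = logging.FileHandler(cfg['log_file_name'] + '.log', mode=mode)
#             to_file.setLevel(getattr(logging, cfg['log_level_file'].upper()))
#             logger.addHandler(to_file)
#         return logger
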
# ------------------------------------------------------------------------------
# Classifier configurations
# ------------------------------------------------------------------------------
# Some modules use classifiers for different tasks where it is mostly not
# important to have a fully optimized classifier, just a "good enough" one.
# Like in the data_ROC, where you can see how well two datasets differ from
# each other.
# Changing these default values will surely affect your results (over- or
# underfitting, for example) but is mostly not required at all.
DEFAULT_CLF_XGB = dict(
    n_estimators=150,  # default 75
    eta=0.1,  # default 0.1, learning-rate
    min_child_weight=0,  # #0 stage 2 to optimize
    max_depth=5,  # #6 stage 2 to optimize
    gamma=0.1,  # stage 3, minimum loss-reduction required to make a split.
    # Higher value -> more conservative
    subsample=0.8,  # stage 4, subsample of the data for a tree.
    # 1 means all data, 0.7 means only 70% of the data
    colsample=1
)

DEFAULT_CLF_TMVA = dict(
    method='kBDT'
)

DEFAULT_CLF_RDF = dict(
    n_estimators=150,
    max_features=None,
    # max_depth=100
)

DEFAULT_CLF_GB = dict(
    n_estimators=200,
    learning_rate=0.15,
    max_depth=5,
    subsample=0.9,
    max_features=None
)

DEFAULT_CLF_ADA = dict(
    n_estimators=200,
    learning_rate=0.2
)

DEFAULT_CLF_NN = dict(
    layers=[500, 500, 500],
    hidden_activation='logistic',
    output_activation='linear',
    input_noise=0,  # [0, 1, 2, 3, 4, 5, 10, 20],
    hidden_noise=0,
    input_dropout=0,
    hidden_dropout=0.03,
    decode_from=1,
    weight_l1=0.01,
    weight_l2=0.01,
    scaler='standard',
    trainers=[{
        'optimize': 'adagrad',
        'patience': 10,
        'learning_rate': 0.1,
        'min_improvement': 0.01,
        'momentum': 0.5,
        'nesterov': True,
        'loss': 'xe'
    }],
)

DEFAULT_CLF_KNN = dict(
    n_neigh=5
)

# default clf config collection
DEFAULT_CLF_CONFIG = dict(
    xgb=DEFAULT_CLF_XGB,
    tmva=DEFAULT_CLF_TMVA,
    gb=DEFAULT_CLF_GB,
    ada=DEFAULT_CLF_ADA,
    nn=DEFAULT_CLF_NN,
    knn=DEFAULT_CLF_KNN,
    rdf=DEFAULT_CLF_RDF
)

# default clf names collection
DEFAULT_CLF_NAME = dict(
    xgb='XGBoost clf',
    tmva='TMVA clf',
    gb='Gradient Boosted Trees clf',
    ada='AdaBoost over Trees clf',
    nn='Theanets Neural Network clf',
    knn='K-Nearest Neighbour clf',
    rdf='Random Forest clf'
)

# ------------------------------------------------------------------------------
# Hyper parameter optimization
# ------------------------------------------------------------------------------
# The backwards feature selection starts with all features and determines the
# ROC AUC. Then it removes one feature at a time; the feature whose removal
# yields the smallest difference to the 'all features' ROC AUC is dropped.
# This continues until the smallest score difference is bigger than
# max_difference_feature_selection.
max_difference_feature_selection = 0.08  # the biggest score difference to 'all features'
# allowed in the AUC when removing features
DEFAULT_HYPER_GENERATOR = 'subgrid'  # the default generator for the hyperspace search
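
# A minimal sketch of the selection loop described above (illustrative only;
# `auc_score` is a hypothetical stand-in for whatever evaluates the ROC AUC
# of a given feature set):
#
#     def backward_selection(features, auc_score,
#                            max_diff=max_difference_feature_selection):
#         features = list(features)
#         full_auc = auc_score(features)  # the 'all features' reference score
#         while len(features) > 1:
#             # score difference to 'all features' when leaving out each feature
#             diffs = dict((f, full_auc - auc_score([g for g in features if g != f]))
#                          for f in features)
#             weakest = min(diffs, key=diffs.get)  # smallest difference
#             if diffs[weakest] > max_diff:
#                 break  # removing any further feature would cost too much AUC
#             features.remove(weakest)
#         return features
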
# ==============================================================================
# END OF CONFIGURABLE PARAMETERS - DO NOT CHANGE WHAT IS BELOW
# ==============================================================================

# DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE
# DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE
# DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE

# ==============================================================================
# START INTERNAL CONFIGURATION - DO NOT CHANGE
# ==============================================================================
run_config = "raredecay.run_config.config"  # manipulated by OutputHandler()
loggers = {}
verbosity = 4
plot_verbosity = 3


def set_verbosity(new_verbosity):
    """Set the verbosity."""
    global verbosity
    verbosity = int(round(new_verbosity))
    _check_verbosity(verbosity)

def set_plot_verbosity(new_plot_verbosity):
    """Set the plot verbosity."""
    global plot_verbosity
    plot_verbosity = int(round(new_plot_verbosity))
    _check_verbosity(plot_verbosity)

def _check_verbosity(verbosity):
    if verbosity not in range(-1, 7):
        raise ValueError("Verbosity has to be an int in the range [-1, 6]")


# ==============================================================================
# Random integer generator for the pseudo random generator (or other things)
# ==============================================================================
rand_seed = random.randint(123, 1512412)  # 357422 or 566575
random.seed(rand_seed)

def randint():
    """Return a random integer."""
    return random.randint(51, 523753)

def randfloat():
    """Return a random float between 0 and 1."""
    return random.random()

def set_seed(seed):
    """Set the global random seed."""
    global rand_seed
    rand_seed = seed
    random.seed(rand_seed)
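
# Example (hypothetical values): seeding makes the helpers above reproducible
# across runs, which is useful to replay a whole analysis:
#
#     set_seed(42)
#     a, b = randint(), randfloat()
#     set_seed(42)
#     assert (a, b) == (randint(), randfloat())
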
# ------------------------------------------------------------------------------
# parallel profile
# ------------------------------------------------------------------------------

# ==============================================================================
# ERROR HANDLING
# ==============================================================================
_error_count = 0  # increases if an error happens
_warning_count = 0  # increases if a warning happens

def error_occured(max_error_count=MAX_ERROR_COUNT):
    """Call this function every time a non-critical error (saving etc.) occurs."""
    global _error_count
    _error_count += 1
    if _error_count >= max_error_count:
        raise RuntimeError("Too many errors encountered from different sources")
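
# Typical (hypothetical) usage: count non-fatal failures instead of aborting,
# letting the run continue until MAX_ERROR_COUNT is reached:
#
#     try:
#         figure.savefig('plots/my_plot.png')  # `figure` is a hypothetical plot object
#     except EnvironmentError:
#         error_occured()  # the run continues unless too many errors accumulated
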

def warning_occured():
    """Call this function every time a warning occurs."""
    global _warning_count
    _warning_count += 1

if __name__ == '__main__':
    pass