
# -*- coding: utf-8 -*-
"""
Created on Fri Apr  1 15:32:17 2016

@author: Jonas Eschle "Mayou36"

| This module provides the meta-configuration.
| Mostly, you do not need to change this file, or only small parts of it.
| Things you may want to change include:
    - whether to prompt for an addition to the file name
    - the number of cores to use
    - the path for pickle-files
    - the default configuration for classifiers, figure saving and more
| Furthermore, it contains:
 - (package-)global default variables for all modules
 - debug options which change some implementation on a basic level, like protocols
 - global configurations, like the endings of specific file-types etc.

The explanation of each variable is available as a comment behind it.

Variables:
---------
run_config:
    It provides the right config module depending on what was chosen
    in the run-methods.
    Should not be changed during the run, only once at the beginning.
SUPPRESS_WRONG_SKLEARN_VERSION:
    This package was built for sklearn 0.17. With 0.18 there are some
    module-name changes, which can cause a crash of the program.
"""

# ==============================================================================
# DO NOT IMPORT ANY PACKAGE (run configuration) DEPENDENCY!
# ==============================================================================
from __future__ import division, absolute_import

try:  # cPickle is the accelerated pickle of Python 2; fall back to plain pickle
    import cPickle as pickle
except ImportError:
    import pickle
import multiprocessing
import random


# ==============================================================================
# Parameters which can be changed WITHOUT affecting stability of a single run.
# Be aware: certain tasks, like loading a pickled file, may fail if the
# file endings are changed.
# ==============================================================================

# ------------------------------------------------------------------------------
# General run parameters
# ------------------------------------------------------------------------------

PROMPT_FOR_COMMENT = False  # lets you add an extension to the run/file name
MULTITHREAD = True  # if False, no parallel work will be done
MULTIPROCESSING = True  # requires MULTITHREAD to be True, otherwise it is effectively False
n_cpu_max = 1  # a rough guideline, not a strict limit. If None, the number of cores will be assigned
use_gpu = False  # if True, optimisation for GPU use is done (e.g. nn not parallel on cpu).
# This does NOT use the GPU yet, but avoids the cpu where the GPU will be invoked
use_stratified_folding = True  # StratifiedKFolding is better from a statistical point of view,
# but also needs more memory; mostly insignificantly, but it can be large


def get_n_cpu(n_cpu=None):
    """Return the number of cpus to use. None means all. Can be -1, -2..."""
    if n_cpu is None:
        n_cpu = -1  # None means "all cores"; resolved by the negative-count rule below
    if isinstance(n_cpu, int):
        if n_cpu < 0:
            n_cpu = max([n_cpu_max + n_cpu + 1, 1])
            # n_cpu = min([n_cpu, n_cpu_max])
    return n_cpu
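
# Illustration of the negative-count convention, mirroring the scikit-learn
# style "-1 means all cores" (values assume n_cpu_max has been set to 8):
#
#     get_n_cpu(3)    # -> 3, positive values pass through unchanged
#     get_n_cpu(-1)   # -> 8, all cores: max([8 + (-1) + 1, 1])
#     get_n_cpu(-2)   # -> 7, all but one core
#     get_n_cpu()     # -> 8, None resolves to "all"
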
# set meta-config variables
def set_parallel_profile(n_cpu=-1, gpu_in_use=False, stratified_kfolding=True):
    """Set the number of cpus and whether a gpu is in use or not."""
    global MULTIPROCESSING, MULTITHREAD, n_cpu_max, use_gpu, use_stratified_folding
    use_stratified_folding = stratified_kfolding
    MULTIPROCESSING = MULTITHREAD = True
    if n_cpu == 1:
        n_cpu_max = 1
    elif n_cpu is None:
        pass
    elif isinstance(n_cpu, int):
        if n_cpu > 1:
            n_cpu_max = n_cpu
        elif n_cpu < 0:
            n_cpu_max = max([multiprocessing.cpu_count() + n_cpu + 1, 1])  # -1 is "all cpus"
        else:
            raise ValueError("Invalid n_cpu argument: " + str(n_cpu))
    else:
        raise TypeError("Wrong n_cpu argument, type: " + str(type(n_cpu)) + " not allowed")
    use_gpu = gpu_in_use if gpu_in_use is not None else use_gpu
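
# Example (hypothetical values): configure the module for a machine where all
# but one core should be used, with stratified folding and no GPU:
#
#     set_parallel_profile(n_cpu=-2, gpu_in_use=False, stratified_kfolding=True)
#     get_n_cpu(-1)  # now resolves against the updated n_cpu_max
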
# ------------------------------------------------------------------------------
# Datatype ending variables
# ------------------------------------------------------------------------------
# The ending of a certain variable type. Change with caution and good reason.
PICKLE_DATATYPE = "pickle"  # default: 'pickle'
ROOT_DATATYPE = "root"  # default: 'root'

# ------------------------------------------------------------------------------
# SHARED OBJECT PATHS, INPUT & OUTPUT
# ------------------------------------------------------------------------------
# folder where the pickled objects are stored
PICKLE_PATH = '/home/mayou/Documents/uniphysik/Bachelor_thesis/analysis/pickle/'
# folder where the git-directory is located. Can be an empty string
GIT_DIR_PATH = "/home/mayou/Documents/uniphysik/Bachelor_thesis/" + \
               "python_workspace/raredecay/raredecay"

# ------------------------------------------------------------------------------
# Debug related options
# ------------------------------------------------------------------------------
# These options should not directly affect the behaviour (except speed etc.)
# IF the right environment is used. Don't touch unless you have good reason to.
PICKLE_PROTOCOL = pickle.HIGHEST_PROTOCOL  # default: pickle.HIGHEST_PROTOCOL
SUPPRESS_WRONG_SKLEARN_VERSION = False  # Should NOT BE CHANGED.

# ==============================================================================
# Parameters which may affect stability
# (setting, for example, MAX_AUTO_FOLDERS to 0 will surely not work)
# ==============================================================================

# ------------------------------------------------------------------------------
# Limits for auto-methods
# ------------------------------------------------------------------------------
# If a folder already exists and no overwrite is in use, a new folder (with a
# trailing number) will be created. A limit can be set to prevent filling the
# disk in case of an endless-loop error or similar.
MAX_AUTO_FOLDERS = 10000  # max number of auto-generated folders by initialize
NO_PROMPT_ASSUME_YES = True  # no user input required, assumes yes (e.g. when overwriting files)
MAX_ERROR_COUNT = 1000  # maximum number of tolerated errors (not able to save figure etc.)
# Criticals will end the run anyway.
MAX_FIGURES = 1000  # max number of figures to be plotted

# ==============================================================================
# DEFAULT SETTINGS for different things
# ==============================================================================
# ------------------------------------------------------------------------------
# Output and plot configurations
# ------------------------------------------------------------------------------
# Available output folders. Do NOT CHANGE THE KEYS as modules depend on them!
# You may add additional key-value pairs or just change some values.
# The names of the folders created inside the run-folder.
DEFAULT_OUTPUT_FOLDERS = dict(
    log="log",  # contains the logger information
    plots="plots",  # contains all the plots
    results="results",  # contains the written output
    config="config"  # NOT YET IMPLEMENTED, but could contain the config file used
)

# The default histogram settings used for some plots
DEFAULT_HIST_SETTINGS = dict(
    bins=40,  # default: 40
    normed=True,  # default: True, useful for shape comparison of distributions
    alpha=0.5,  # transparency [0.0, 1.0]
    histtype='stepfilled'
)

# Default configuration for most of the figures for save_fig from OutputHandler()
DEFAULT_SAVE_FIG = dict(
    file_format=['png', 'pdf'],  # default: ['png', 'svg'], the file formats to save to
    dpi=150,  # for implementation details, see OutputHandler()
    to_pickle=True,  # whether to pickle the plot (and therefore be able to replot)
    # save_cfg=None
)

# Default configuration for additional figures (plots you mostly do not care
# about but may be happy to have saved somewhere)
DEFAULT_EXT_SAVE_FIG = dict(
    file_format=['png', 'pdf'],
    to_pickle=True
    # save_cfg=None
)

# A logger writes some information during the run, just for control of the
# correct execution. The log will be written to console, to file, or both.
# Each message has a level, ranging from the lowest (most unimportant) 'debug'
# to 'critical'. You can specify which level (plus the more important ones)
# will appear where.
# Example: you can set the console to 'error' and the file to 'info'. This way
# you also collect seemingly unnecessary information (which may later be nice
# to check whether a variable was meaningful), but on screen you will only see
# whether an error or critical occurs.
DEFAULT_LOGGER_CFG = dict(
    logging_mode='console',  # where the logger is written to:
    # 'both', 'file', 'console' or 'no'
    log_level_file='debug',  # 'debug', 'info', 'warning', 'error', 'critical'
    # specifies the level to be logged to the file
    log_level_console='debug',  # 'debug', 'info', 'warning', 'error', 'critical'
    # specifies the level to be logged to the console
    overwrite_file=True,  # whether to overwrite the log file each time
    # or instead make a new one each run
    log_file_name='AAlastRun',  # the beginning of the name of the logfile, like 'project1'
    log_file_dir=DEFAULT_OUTPUT_FOLDERS.get('log')
)
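
# A minimal sketch of how such a configuration maps onto the standard
# library's `logging` module (illustrative only; the package's own logger
# lives elsewhere, `make_logger` is a hypothetical helper, and handling of
# log_file_dir is omitted for brevity):
#
#     import logging
#
#     def make_logger(name, cfg=DEFAULT_LOGGER_CFG):
#         logger = logging.getLogger(name)
#         logger.setLevel(logging.DEBUG)  # each handler filters on its own below
#         if cfg['logging_mode'] in ('console', 'both'):
#             console = logging.StreamHandler()
#             console.setLevel(getattr(logging, cfg['log_level_console'].upper()))
#             logger.addHandler(console)
#         if cfg['logging_mode'] in ('file', 'both'):
#             mode = 'w' if cfg['overwrite_file'] else 'a'
#             to_file = logging.FileHandler(cfg['log_file_name'] + '.log', mode=mode)
#             to_file.setLevel(getattr(logging, cfg['log_level_file'].upper()))
#             logger.addHandler(to_file)
#         return logger
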
# ------------------------------------------------------------------------------
# Classifier configurations
# ------------------------------------------------------------------------------
# Some modules use classifiers for different tasks where it is mostly not
# important to have a fully optimized classifier, just a "good enough" one.
# Like in the data_ROC, where you can see how well two datasets differ from
# each other.
# Changing these default values will surely affect your results (over- or
# underfitting, for example) but is mostly not required at all.
DEFAULT_CLF_XGB = dict(
    n_estimators=150,  # default 75
    eta=0.1,  # default 0.1, learning-rate
    min_child_weight=0,  # #0 stage 2 to optimize
    max_depth=5,  # #6 stage 2 to optimize
    gamma=0.1,  # stage 3, minimum loss-reduction required to make a split.
    # Higher value -> more conservative
    subsample=0.8,  # stage 4, subsample of the data for a tree.
    # 1 means all data, 0.7 means only 70% of the data
    colsample=1
)

DEFAULT_CLF_TMVA = dict(
    method='kBDT'
)

DEFAULT_CLF_RDF = dict(
    n_estimators=150,
    max_features=None,
    # max_depth=100
)

DEFAULT_CLF_GB = dict(
    n_estimators=200,
    learning_rate=0.15,
    max_depth=5,
    subsample=0.9,
    max_features=None
)

DEFAULT_CLF_ADA = dict(
    n_estimators=200,
    learning_rate=0.2
)

DEFAULT_CLF_NN = dict(
    layers=[500, 500, 500],
    hidden_activation='logistic',
    output_activation='linear',
    input_noise=0,  # [0, 1, 2, 3, 4, 5, 10, 20],
    hidden_noise=0,
    input_dropout=0,
    hidden_dropout=0.03,
    decode_from=1,
    weight_l1=0.01,
    weight_l2=0.01,
    scaler='standard',
    trainers=[{
        'optimize': 'adagrad',
        'patience': 10,
        'learning_rate': 0.1,
        'min_improvement': 0.01,
        'momentum': 0.5,
        'nesterov': True,
        'loss': 'xe'
    }],
)

DEFAULT_CLF_KNN = dict(
    n_neigh=5
)

# default clf config collection
DEFAULT_CLF_CONFIG = dict(
    xgb=DEFAULT_CLF_XGB,
    tmva=DEFAULT_CLF_TMVA,
    gb=DEFAULT_CLF_GB,
    ada=DEFAULT_CLF_ADA,
    nn=DEFAULT_CLF_NN,
    knn=DEFAULT_CLF_KNN,
    rdf=DEFAULT_CLF_RDF
)

# default clf names collection
DEFAULT_CLF_NAME = dict(
    xgb='XGBoost clf',
    tmva='TMVA clf',
    gb='Gradient Boosted Trees clf',
    ada='AdaBoost over Trees clf',
    nn='Theanets Neural Network clf',
    knn='K-Nearest Neighbour clf',
    rdf='Random Forest clf'
)

# ------------------------------------------------------------------------------
# Hyper parameter optimization
# ------------------------------------------------------------------------------
# The backwards feature selection starts with all features and determines the
# ROC AUC. Then it removes one feature at a time; the feature whose removal
# yields the smallest difference to the 'all features' ROC AUC is dropped.
# This continues until the smallest score difference is bigger than
# max_difference_feature_selection.
max_difference_feature_selection = 0.08  # the biggest score difference to 'all features'
# allowed in the AUC when removing features
DEFAULT_HYPER_GENERATOR = 'subgrid'  # the default generator for the hyperspace search
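
# A minimal sketch of the selection loop described above (illustrative only;
# `auc_score` is a hypothetical stand-in for whatever evaluates the ROC AUC
# of a given feature set):
#
#     def backward_selection(features, auc_score,
#                            max_diff=max_difference_feature_selection):
#         features = list(features)
#         full_auc = auc_score(features)  # the 'all features' reference score
#         while len(features) > 1:
#             # score difference to 'all features' when leaving out each feature
#             diffs = dict((f, full_auc - auc_score([g for g in features if g != f]))
#                          for f in features)
#             weakest = min(diffs, key=diffs.get)  # smallest difference
#             if diffs[weakest] > max_diff:
#                 break  # removing any further feature would cost too much AUC
#             features.remove(weakest)
#         return features
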
# ==============================================================================
# END OF CONFIGURABLE PARAMETERS - DO NOT CHANGE WHAT IS BELOW
# ==============================================================================

# DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE
# DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE
# DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE DO NOT CROSS THIS LINE

# ==============================================================================
# START INTERNAL CONFIGURATION - DO NOT CHANGE
# ==============================================================================
run_config = "raredecay.run_config.config"  # manipulated by OutputHandler()
loggers = {}
verbosity = 4
plot_verbosity = 3


def set_verbosity(new_verbosity):
    """Set the verbosity."""
    global verbosity
    verbosity = int(round(new_verbosity))
    _check_verbosity(verbosity)

def set_plot_verbosity(new_plot_verbosity):
    """Set the plot verbosity."""
    global plot_verbosity
    plot_verbosity = int(round(new_plot_verbosity))
    _check_verbosity(plot_verbosity)

def _check_verbosity(verbosity):
    if verbosity not in range(-1, 7):
        raise ValueError("Verbosity has to be an int in the range [-1, 6]")


# ==============================================================================
# Random integer generator for the pseudo random generator (or other things)
# ==============================================================================
rand_seed = random.randint(123, 1512412)  # 357422 or 566575
random.seed(rand_seed)

def randint():
    """Return a random integer."""
    return random.randint(51, 523753)

def randfloat():
    """Return a random float between 0 and 1."""
    return random.random()

def set_seed(seed):
    """Set the global random seed."""
    global rand_seed
    rand_seed = seed
    random.seed(rand_seed)
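
# Example (hypothetical values): seeding makes the helpers above reproducible
# across runs, which is useful to replay a whole analysis:
#
#     set_seed(42)
#     a, b = randint(), randfloat()
#     set_seed(42)
#     assert (a, b) == (randint(), randfloat())
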
# ------------------------------------------------------------------------------
# parallel profile
# ------------------------------------------------------------------------------

# ==============================================================================
# ERROR HANDLING
# ==============================================================================
_error_count = 0  # increases if an error happens
_warning_count = 0  # increases if a warning happens

def error_occured(max_error_count=MAX_ERROR_COUNT):
    """Call this function every time a non-critical error (saving etc.) occurs."""
    global _error_count
    _error_count += 1
    if _error_count >= max_error_count:
        raise RuntimeError("Too many errors encountered from different sources")
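
# Typical (hypothetical) usage: count non-fatal failures instead of aborting,
# letting the run continue until MAX_ERROR_COUNT is reached:
#
#     try:
#         figure.savefig('plots/my_plot.png')  # `figure` is a hypothetical plot object
#     except EnvironmentError:
#         error_occured()  # the run continues unless too many errors accumulated
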

def warning_occured():
    """Call this function every time a warning occurs."""
    global _warning_count
    _warning_count += 1

if __name__ == '__main__':
    pass