From 3b398d3f4410e1c30ecc0a89bfab127aaac2c4c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20N=C3=B6the?= Date: Thu, 16 May 2019 13:32:52 +0200 Subject: [PATCH 1/4] Write sample_fraction into hdf files in split_data --- aict_tools/scripts/split_data.py | 13 +++++++++++++ tests/test_executables.py | 21 +++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/aict_tools/scripts/split_data.py b/aict_tools/scripts/split_data.py index 6148c37..72e6bd5 100644 --- a/aict_tools/scripts/split_data.py +++ b/aict_tools/scripts/split_data.py @@ -7,6 +7,7 @@ from math import ceil import h5py from tqdm import tqdm +import h5py log = logging.getLogger() @@ -117,6 +118,10 @@ def split_multi_telescope_data(input_path, output_basename, fraction, name): write_data(selected_runs, path, key='runs', use_h5py=True, mode='w') write_data(selected_array_events, path, key='array_events', use_h5py=True, mode='a') write_data(selected_telescope_events, path, key='telescope_events', use_h5py=True, mode='a') + + with h5py.File(path, 'r+') as f: + f.attrs['sample_fraction'] = n / n_total + log.debug(f'selected runs {set(selected_run_ids)}') log.debug(f'Runs minus selected runs {ids - set(selected_run_ids)}') ids = ids - set(selected_run_ids) @@ -160,6 +165,11 @@ def split_single_telescope_data_chunked(input_path, output_basename, inkey, key, )) write_data(selected_data, path, key=key, use_h5py=True, mode=mode) + for n, part_name in zip(num_ids, name): + path = output_basename + '_' + part_name + '.hdf5' + with h5py.File(path, mode='r+') as f: + f.attrs['sample_fraction'] = n / n_total + def split_single_telescope_data(input_path, output_basename, fmt, inkey, key, fraction, name): @@ -184,6 +194,9 @@ def split_single_telescope_data(input_path, output_basename, fmt, inkey, key, fr log.info('Writing {} telescope-array events to: {}'.format(n, path)) write_data(selected_data, path, key=key, use_h5py=True, mode='w') + with h5py.File(path, mode='r+') as f: + f.attrs['sample_fraction'] = n / n_total + elif fmt == 'csv': filename = output_basename + '_' + part_name + '.csv' log.info('Writing {} telescope-array events to: {}'.format(n, filename)) diff --git a/tests/test_executables.py b/tests/test_executables.py index 7aab8d9..0c68cdf 100644 --- a/tests/test_executables.py +++ b/tests/test_executables.py @@ -3,6 +3,7 @@ from click.testing import CliRunner import shutil from traceback import print_exception +import h5py def test_train_regressor(): @@ -299,9 +300,9 @@ def test_split_data_executable(): os.path.join(d, 'gamma.hdf5'), os.path.join(d, 'signal'), '-ntest', # no spaces here. maybe a bug in click? - '-f0.5', + '-f0.75', '-ntrain', - '-f0.5', + '-f0.25', ] ) if result.exit_code != 0: @@ -313,9 +314,15 @@ def test_split_data_executable(): test_path = os.path.join(d, 'signal_test.hdf5') assert os.path.isfile(test_path) + with h5py.File(test_path, 'r') as f: + assert f.attrs['sample_fraction'] == 0.75 + train_path = os.path.join(d, 'signal_train.hdf5') assert os.path.isfile(train_path) + with h5py.File(train_path, 'r') as f: + assert f.attrs['sample_fraction'] == 0.25 + def test_split_data_executable_chunked(): from aict_tools.scripts.split_data import main as split @@ -331,9 +338,9 @@ def test_split_data_executable_chunked(): os.path.join(d, 'gamma.hdf5'), os.path.join(d, 'signal'), '-ntest', # no spaces here. maybe a bug in click? - '-f0.5', + '-f0.75', '-ntrain', - '-f0.5', + '-f0.25', '--chunksize=100', ] ) @@ -346,5 +353,11 @@ def test_split_data_executable_chunked(): test_path = os.path.join(d, 'signal_test.hdf5') assert os.path.isfile(test_path) + with h5py.File(test_path, 'r') as f: + assert f.attrs['sample_fraction'] == 0.75 + train_path = os.path.join(d, 'signal_train.hdf5') assert os.path.isfile(train_path) + + with h5py.File(train_path, 'r') as f: + assert f.attrs['sample_fraction'] == 0.25 From bde42ad5aa9f0c3a8f2c2be5b46c2abc39940dad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20N=C3=B6the?= Date: Thu, 16 May 2019 13:51:59 +0200 Subject: [PATCH 2/4] Bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c2a460a..aad2de4 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name='aict_tools', - version='0.14.0', + version='0.14.1', description='Artificial Intelligence for Imaging Atmospheric Cherenkov Telescopes', long_description=long_description, long_description_content_type='text/markdown', From 272b43a501413eef939de03911ae0368eff25e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20N=C3=B6the?= Date: Thu, 16 May 2019 14:37:51 +0200 Subject: [PATCH 3/4] Add sample_fraction also in to_dl3 --- aict_tools/scripts/fact_to_dl3.py | 8 ++++++++ examples/gamma.hdf5 | Bin 2849264 -> 2849384 bytes tests/test_executables.py | 6 +++++- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/aict_tools/scripts/fact_to_dl3.py b/aict_tools/scripts/fact_to_dl3.py index 7ff9f6c..f4e6565 100644 --- a/aict_tools/scripts/fact_to_dl3.py +++ b/aict_tools/scripts/fact_to_dl3.py @@ -6,6 +6,7 @@ import pandas as pd from functools import partial import os +import h5py from astropy.time import Time from astropy.coordinates import AltAz, SkyCoord @@ -369,6 +370,13 @@ def main( else: to_h5py(df[dl3_columns_sim], output, key='events', mode='a') + with h5py.File(data_path, 'r') as f: + sample_fraction = f.attrs.get('sample_fraction') + + if sample_fraction is not None: + with h5py.File(output, 'r+') as f: + f.attrs['sample_fraction'] = sample_fraction + if source: log.info('Copying "runs" group') to_h5py(runs, output, key='runs', mode='a') diff --git a/examples/gamma.hdf5 b/examples/gamma.hdf5 index 65122719c534cb38c052d6d434f5c0b2551ecc68..a6cf8f1ac0f06a792f9594b58358fef4b79562bf 100644 GIT binary patch delta 244 zcmW-cO)f)W07bu7MNyQty!vbXS35Nj6RboUA(~JP25ROcXG~)q9f%m%f|VFHVFBKA zZf@@M+~Ln4`u-h6FZImjW501w%Z4a~5GPG3BuN?en*aL{o{&f~G9p#`x6vL)NHBfCgSdv$(#aoak(Z{KuYpUahEdL^ZO`1QH}esYu?>c5r$0FL5H AQ2+n{ delta 158 zcmV~$w++Go002RmoH04)Y%jn_tl*O3r7uAgOhL&`lt@rea`*Z^g7@bUT=uDVa*d9+ zWx9L+C2@Eb159BWGnmC3=COc9EMXZdSj8IFF~kNov4w5yU>AGX#{mv;gkzlG6lXZc V1uk)gYuw-#ceuv`Mo;5Y@&};AK0W{d diff --git a/tests/test_executables.py b/tests/test_executables.py index 0c68cdf..ac5d790 100644 --- a/tests/test_executables.py +++ b/tests/test_executables.py @@ -267,6 +267,7 @@ def test_to_dl3(): print_exception(*result.exc_info) assert result.exit_code == 0 + output = os.path.join(d, 'gamma_dl3.hdf5') result = runner.invoke( to_dl3, [ @@ -276,7 +277,7 @@ def test_to_dl3(): os.path.join(d, 'regressor.pkl'), os.path.join(d, 'disp.pkl'), os.path.join(d, 'sign.pkl'), - os.path.join(d, 'gamma_dl3.hdf5'), + output, ] ) @@ -285,6 +286,9 @@ def test_to_dl3(): print_exception(*result.exc_info) assert result.exit_code == 0 + with h5py.File(output) as f: + assert f.attrs['sample_fraction'] == 1000 / 1851297 + def test_split_data_executable(): from aict_tools.scripts.split_data import main as split From 8d079b1be5d53cd0dc16ceb7b38ca13e8a68ea5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20N=C3=B6the?= Date: Thu, 16 May 2019 14:42:23 +0200 Subject: [PATCH 4/4] Bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d46e51e..745d476 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name='aict_tools', - version='0.15.0', + version='0.16.0', description='Artificial Intelligence for Imaging Atmospheric Cherenkov Telescopes', long_description=long_description, long_description_content_type='text/markdown',