Change formula for chunking of files in combine #642

Merged
merged 19 commits into from Jul 27, 2019
2 changes: 1 addition & 1 deletion .travis.yml
@@ -23,7 +23,7 @@ env:
         # to repeat them for all configurations.
         - NUMPY_VERSION=stable
         - ASTROPY_VERSION=stable
-        - CONDA_DEPENDENCIES='scipy reproject psutil cython astroscrappy scikit-image'
+        - CONDA_DEPENDENCIES='scipy reproject psutil cython astroscrappy scikit-image memory_profiler'
         - PIP_DEPENDENCIES=''
         - MAIN_CMD='python setup.py'
         - SETUP_CMD='test'
3 changes: 3 additions & 0 deletions CHANGES.rst
@@ -39,6 +39,9 @@ Bug Fixes

 - Function ``combine`` avoids keeping files open unnecessarily. [#629, #630]

+- Function ``combine`` more accurately estimates memory use
+  when deciding how to chunk files. [#638, #642]
+
 - Raise ``ValueError`` error in ``subtract_dark`` for when the errors have
   different shapes [#674, #677]

2 changes: 1 addition & 1 deletion appveyor.yml
@@ -10,7 +10,7 @@ environment:
   PYTHON_ARCH: "64"  # needs to be set for CMD_IN_ENV to succeed. If a mix
                      # of 32 bit and 64 bit builds are needed, move this
                      # to the matrix section.
-  CONDA_DEPENDENCIES: "scipy reproject cython astroscrappy"
+  CONDA_DEPENDENCIES: "scipy reproject cython astroscrappy scikit-image memory_profiler"
   # Need the latest scikit-image (0.14.2 or higher), which is not in
   # anaconda yet.
   PIP_DEPENDENCIES: "scikit-image"
66 changes: 41 additions & 25 deletions ccdproc/combiner.py
@@ -523,6 +523,36 @@ def _calculate_step_sizes(x_size, y_size, num_chunks):
     return xstep, ystep


+def _calculate_size_of_image(ccd,
+                             combine_uncertainty_function):
+    # If uncertainty_func is given for combine this will create an uncertainty
+    # even if the originals did not have one. In that case we need to create
+    # an empty placeholder.
+    if ccd.uncertainty is None and combine_uncertainty_function is not None:
+        ccd.uncertainty = StdDevUncertainty(np.zeros(ccd.data.shape))
+
+    size_of_an_img = ccd.data.nbytes
+    try:
+        size_of_an_img += ccd.uncertainty.array.nbytes
+    # In case uncertainty is None it has no "array" and in case the "array" is
+    # not a numpy array:
+    except AttributeError:
+        pass
+    # Mask is enforced to be a numpy.array across astropy versions
+    if ccd.mask is not None:
+        size_of_an_img += ccd.mask.nbytes
+    # flags is not necessarily a numpy array so do not fail with an
+    # AttributeError in case something was set!
+    # TODO: Flags are not taken into account in Combiner. This number is added
+    # nevertheless for future compatibility.
+    try:
+        size_of_an_img += ccd.flags.nbytes
+    except AttributeError:
+        pass
+
+    return size_of_an_img
+
+
 def combine(img_list, output_file=None,
             method='average', weights=None, scale=None, mem_limit=16e9,
             clip_extrema=False, nlow=1, nhigh=1,
@@ -662,12 +692,6 @@ def combine(img_list, output_file=None,
     # User has provided fits filenames to read from
     ccd = CCDData.read(img_list[0], **ccdkwargs)

-    # If uncertainty_func is given for combine this will create an uncertainty
-    # even if the originals did not have one. In that case we need to create
-    # an empty placeholder.
-    if ccd.uncertainty is None and combine_uncertainty_function is not None:
-        ccd.uncertainty = StdDevUncertainty(np.zeros(ccd.data.shape))
-
     if dtype is None:
         dtype = np.float64

@@ -677,29 +701,21 @@
     if ccd.data.dtype != dtype:
         ccd.data = ccd.data.astype(dtype)

-    size_of_an_img = ccd.data.nbytes
-    try:
-        size_of_an_img += ccd.uncertainty.array.nbytes
-    # In case uncertainty is None it has no "array" and in case the "array" is
-    # not a numpy array:
-    except AttributeError:
-        pass
-    # Mask is enforced to be a numpy.array across astropy versions
-    if ccd.mask is not None:
-        size_of_an_img += ccd.mask.nbytes
-    # flags is not necessarily a numpy array so do not fail with an
-    # AttributeError in case something was set!
-    # TODO: Flags are not taken into account in Combiner. This number is added
-    # nevertheless for future compatibility.
-    try:
-        size_of_an_img += ccd.flags.nbytes
-    except AttributeError:
-        pass
+    size_of_an_img = _calculate_size_of_image(ccd,
+                                              combine_uncertainty_function)

     no_of_img = len(img_list)

+    # Set a memory use factor based on profiling
+    if method == 'median':
+        memory_factor = 3
+    else:
+        memory_factor = 2
+
+    memory_factor *= 1.5
+
     # determine the number of chunks to split the images into
-    no_chunks = int((size_of_an_img * no_of_img) / mem_limit) + 1
+    no_chunks = int((memory_factor * size_of_an_img * no_of_img) / mem_limit) + 1
     if no_chunks > 1:
         log.info('splitting each image into {0} chunks to limit memory usage '
                  'to {1} bytes.'.format(no_chunks, mem_limit))
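To make the effect of the change concrete, here is a small worked example of the new chunking formula. The numbers are illustrative, not taken from the PR: a median combine of 20 float64 images of 2024 x 2031 pixels (data only, no mask or uncertainty) under a hypothetical 1 GB memory limit.

# Illustrative sketch of the chunking arithmetic above; all values assumed.
size_of_an_img = 2024 * 2031 * 8   # ~32.9 MB per float64 image
no_of_img = 20
memory_factor = 3 * 1.5            # median method, times the 1.5 safety margin
mem_limit = 1e9                    # hypothetical 1 GB cap

no_chunks = int((memory_factor * size_of_an_img * no_of_img) / mem_limit) + 1
print(no_chunks)  # 3; the old formula, without memory_factor, gives 1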
185 changes: 185 additions & 0 deletions ccdproc/tests/run_for_memory_profile.py
@@ -0,0 +1,185 @@
from argparse import ArgumentParser
from tempfile import TemporaryDirectory
from pathlib import Path
import sys
import gc

import psutil
from memory_profiler import memory_usage

import numpy as np
from astropy.io import fits
from astropy.stats import median_absolute_deviation
from astropy.nddata import CCDData, StdDevUncertainty

# This bit of hackery ensures that we can see ccdproc from within
# the test suite
sys.path.append(str(Path().cwd()))
from ccdproc import combine, ImageFileCollection

try:
    from ccdproc.combiner import _calculate_size_of_image
except ImportError:
    def _calculate_size_of_image(ccd,
                                 combine_uncertainty_function):
        # If uncertainty_func is given for combine this will create an uncertainty
        # even if the originals did not have one. In that case we need to create
        # an empty placeholder.
        if ccd.uncertainty is None and combine_uncertainty_function is not None:
            ccd.uncertainty = StdDevUncertainty(np.zeros(ccd.data.shape))

        size_of_an_img = ccd.data.nbytes
        try:
            size_of_an_img += ccd.uncertainty.array.nbytes
        # In case uncertainty is None it has no "array" and in case the "array" is
        # not a numpy array:
        except AttributeError:
            pass
        # Mask is enforced to be a numpy.array across astropy versions
        if ccd.mask is not None:
            size_of_an_img += ccd.mask.nbytes
        # flags is not necessarily a numpy array so do not fail with an
        # AttributeError in case something was set!
        # TODO: Flags are not taken into account in Combiner. This number is added
        # nevertheless for future compatibility.
        try:
            size_of_an_img += ccd.flags.nbytes
        except AttributeError:
            pass

        return size_of_an_img


# Do not combine these into one statement. When all references are lost
# to a TemporaryDirectory the directory is automatically deleted. _TMPDIR
# creates a reference that will stick around.
_TMPDIR = TemporaryDirectory()
TMPPATH = Path(_TMPDIR.name)


def generate_fits_files(n_images, size=None, seed=1523):
    if size is None:
        use_size = (2024, 2031)
    else:
        use_size = (size, size)

    np.random.seed(seed)

    base_name = 'test-combine-{num:03d}.fits'

    for num in range(n_images):
        data = np.random.normal(size=use_size)
        # Now add some outlying pixels so there is something to clip
        n_bad = 50000
        bad_x = np.random.randint(0, high=use_size[0] - 1, size=n_bad)
        bad_y = np.random.randint(0, high=use_size[1] - 1, size=n_bad)
        data[bad_x, bad_y] = (np.random.choice([-1, 1], size=n_bad) *
                              (10 + np.random.rand(n_bad)))
        hdu = fits.PrimaryHDU(data=np.asarray(data, dtype='float32'))
        hdu.header['for_prof'] = 'yes'
        hdu.header['bunit'] = 'adu'
        path = TMPPATH.resolve() / base_name.format(num=num)
        hdu.writeto(path, overwrite=True)


def run_memory_profile(n_files, sampling_interval, size=None, sigma_clip=False,
                       combine_method=None, memory_limit=None):
    """
    Combine a set of files and sample memory use while the
    combination is running.

    Parameters
    ----------

    n_files : int
        Number of files to combine.

    sampling_interval : float
        Time, in seconds, between memory samples.

    size : int, optional
        Size of one side of the image (the image is always square).

    sigma_clip : bool, optional
        If true, sigma clip the data before combining.

    combine_method : str, optional
        Should be one of the combine methods accepted by
        ccdproc.combine

    memory_limit : int, optional
        Cap on memory use during image combination.
    """
    # Do a little input validation
    if n_files <= 0:
        raise ValueError("Argument 'n_files' must be a positive integer")

    proc = psutil.Process()

    print('Process ID is: ', proc.pid, flush=True)
    ic = ImageFileCollection(str(TMPPATH))
    files = ic.files_filtered(for_prof='yes', include_path=True)

    kwargs = {'method': combine_method}

    if sigma_clip:
        kwargs.update(
            {'sigma_clip': True,
             'sigma_clip_low_thresh': 5,
             'sigma_clip_high_thresh': 5,
             'sigma_clip_func': np.ma.median,
             'sigma_clip_dev_func': median_absolute_deviation}
        )

    ccd = CCDData.read(files[0])
    expected_img_size = _calculate_size_of_image(ccd, None)

    if memory_limit:
        kwargs['mem_limit'] = memory_limit

    pre_mem_use = memory_usage(-1, interval=sampling_interval, timeout=1)
    baseline = np.mean(pre_mem_use)
    print('Subtracting baseline memory before profile: {}'.format(baseline))
    mem_use = memory_usage((combine, (files,), kwargs),
                           interval=sampling_interval, timeout=None)
    mem_use = [m - baseline for m in mem_use]
    return mem_use, expected_img_size


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('number', type=int,
                        help='Number of files to combine.')
    parser.add_argument('--size', type=int, action='store',
                        help='Size of one side of image to create. '
                             'All images are square, so only give '
                             'a single number for the size.')
    parser.add_argument('--combine-method', '-c',
                        choices=('average', 'median'),
                        help='Method to use to combine images.')
    parser.add_argument('--memory-limit', type=int,
                        help='Limit combination to this amount of memory')
    parser.add_argument('--sigma-clip', action='store_true',
                        help='If set, sigma-clip before combining. Clipping '
                             'will be done with high/low limit of 5. '
                             'The central function is the median, the '
                             'deviation is the median_absolute_deviation.')
    parser.add_argument('--sampling-freq', type=float, default=0.05,
                        help='Time, in seconds, between memory samples.')
    parser.add_argument('--frequent-gc', action='store_true',
                        help='If set, perform garbage collection '
                             'much more frequently than the default.')
    args = parser.parse_args()

    if args.frequent_gc:
        gc.set_threshold(10, 10, 10)

    print("Garbage collection thresholds: ", gc.get_threshold())

    # Generate the test images before profiling; the temporary
    # directory starts out empty.
    generate_fits_files(args.number, size=args.size)

    mem_use, _ = run_memory_profile(args.number, args.sampling_freq,
                                    size=args.size,
                                    sigma_clip=args.sigma_clip,
                                    combine_method=args.combine_method,
                                    memory_limit=args.memory_limit)
    print('Max memory usage (MB): ', np.max(mem_use))
    print('Baseline memory usage (MB): ', mem_use[0])
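For reference, a minimal sketch of driving these helpers directly from Python rather than from the command line. All argument values here are illustrative, and only functions defined in the script above are used.

# Sketch only: generate 10 test images, then profile a sigma-clipped
# median combine under an assumed 1 GB memory limit.
generate_fits_files(10, size=2000)
mem_use, img_size = run_memory_profile(10, 0.05,
                                       combine_method='median',
                                       sigma_clip=True,
                                       memory_limit=1_000_000_000)
print('Peak memory above baseline (MB):', max(mem_use))
print('Estimated size of one image (bytes):', img_size)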