Skip to content

Commit

Permalink
first pass at downloading crcns datasets
Browse files Browse the repository at this point in the history
🐛 uses six.zip for py2/3 compat


better


fix typo


ignore dataset downloads for code coverage


adds a test
  • Loading branch information
neuromusic authored and Justin Kiggins committed Aug 2, 2018
1 parent 96a863a commit 046abd8
Show file tree
Hide file tree
Showing 8 changed files with 431 additions and 1 deletion.
4 changes: 3 additions & 1 deletion codecov.yml
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
ignore:
- "neuroglia/calcium/oasis/functions.py"
- "neuroglia/datasets/crcns.py"
- "neuroglia/datasets/figshare.py"
- "neuroglia/calcium/oasis/functions.py"
8 changes: 8 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,11 @@ Tensor transformers
:toctree: generated/

tensor.ResponseReducer

Datasets
-------------------

.. autosummary::
:toctree: generated/

datasets.fetch_rat_hippocampus_foraging
1 change: 1 addition & 0 deletions neuroglia/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .crcns import fetch_rat_hippocampus_foraging
from .synthetic_calcium import make_calcium_traces
285 changes: 285 additions & 0 deletions neuroglia/datasets/crcns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
# from os.path import exists
# from os import makedirs, remove
import os
import tarfile
from collections import namedtuple
import requests
import pandas as pd
import numpy as np
from sklearn.datasets.base import _sha256, _pkl_filepath
from sklearn.utils import Bunch
from .utils import get_neuroglia_data_home

try:
from itertools import izip as zip
except ImportError: # must be python3
pass

URL = 'https://portal.nersc.gov/project/crcns/download/index.php'


def get_environ_username():
    """Return the CRCNS login name taken from the ``CRCNS_USER`` env var.

    Raises ``KeyError`` when the variable is not set.
    """
    environ = os.environ
    return environ['CRCNS_USER']


def get_environ_password():
    """Return the CRCNS password taken from the ``CRCNS_PASSWORD`` env var.

    Raises ``KeyError`` when the variable is not set.
    """
    environ = os.environ
    return environ['CRCNS_PASSWORD']


Payload = namedtuple('Payload',['username','password','fn','submit'])


def _create_payload(username,password,path,filename):
datafile = "{}/{}".format(path,filename)
return dict(
username=username,
password=password,
fn=datafile,
submit='Login'
)


def _create_local_filename(dest,datafile):
if dest is None:
dest = os.cwd()
return os.path.join(
dest,
datafile.split('/')[-1],
)


def crcns_retrieve(request_payload, local_filename):
    """Stream a CRCNS download to ``local_filename`` and return that path.

    POSTs ``request_payload`` (credentials + file path) to the portal URL and
    writes the response body to disk in 1 KiB chunks.
    """
    with requests.Session() as session:
        response = session.post(URL, data=request_payload, stream=True)
        with open(local_filename, 'wb') as outfile:
            for chunk in response.iter_content(chunk_size=1024):
                if not chunk:
                    continue  # skip keep-alive chunks
                outfile.write(chunk)
    return local_filename


def _fetch_crcns_datafile(crcns,local_filename=None,username=None,password=None,chunk_size=1024):
    """Ensure ``crcns.filename`` exists locally with the expected SHA256.

    A pre-existing local file whose checksum already matches is returned
    without downloading. Credentials fall back to the environment
    (``CRCNS_USER`` / ``CRCNS_PASSWORD``) when not passed explicitly.

    Returns the local file path; raises ``IOError`` on checksum mismatch.

    NOTE(review): ``chunk_size`` is accepted but never forwarded --
    ``crcns_retrieve`` hard-codes its own chunk size. Confirm before relying
    on this parameter.
    """
    target = crcns.filename if local_filename is None else local_filename

    # Short-circuit: a verified local copy makes the download unnecessary.
    if os.path.exists(target) and _sha256(target) == crcns.checksum:
        return target

    user = get_environ_username() if username is None else username
    pw = get_environ_password() if password is None else password

    payload = _create_payload(
        user,
        pw,
        crcns.path,
        crcns.filename,
    )

    crcns_retrieve(payload, target)

    digest = _sha256(target)
    if digest != crcns.checksum:
        raise IOError("{} has an SHA256 checksum ({}) "
                      "differing from expected ({}), "
                      "file may be corrupted.".format(target, digest,
                                                      crcns.checksum))
    return target

# Metadata describing one downloadable file on crcns.org:
#   filename -- remote file name,
#   path     -- dataset path on the portal (e.g. "hc-2/ec014.333"),
#   checksum -- expected SHA256 hex digest of the downloaded file.
CRCNSFileMetadata = namedtuple(
    'CRCNSFileMetadata',
    ['filename', 'path', 'checksum'],
)

def read_spikes_from_tar(f):
    """Extract a spike table from the hc-2 session tar archive.

    Parameters
    ----------
    f : tarfile.TarFile
        Open archive containing the ec014.333 ``.res.N`` (spike timestamps)
        and ``.clu.N`` (cluster labels) files for shanks 1-8.

    Returns
    -------
    pandas.DataFrame
        One row per spike with columns ``time`` (seconds, timestamp frame
        divided by 20 kHz) and ``neuron`` (``"<shank>-<cluster:02d>"``).
        Lines whose cluster id is 0 or 1 are excluded.

    NOTE(review): lines of each ``.res`` file are zipped against lines of the
    matching ``.clu`` file with no header skipping -- confirm this matches the
    hc-2 file layout before changing the pairing.
    """
    SPIKES_HZ = 20000  # sampling rate of the .res timestamp files

    # The original hard-coded two parallel 8-element tuples of paths; the
    # names differ only by extension and shank number, so generate them.
    base = 'crcns/hc2/ec014.333/ec014.333'
    shanks = range(1, 9)

    spikes = []

    for shank in shanks:
        ts = f.extractfile('{}.res.{}'.format(base, shank))
        clu = f.extractfile('{}.clu.{}'.format(base, shank))
        for frame, cluster in zip(ts.readlines(), clu.readlines()):
            cluster = int(cluster)
            if cluster > 1:
                spike = dict(
                    time=float(frame) / SPIKES_HZ,
                    neuron='{}-{:02d}'.format(shank, cluster),
                )
                spikes.append(spike)

    spikes = pd.DataFrame(spikes)
    return spikes

def read_location_from_tar(f):
    """Load the animal's position trace (.whl file) from the hc-2 archive.

    Returns a DataFrame with columns ``x``, ``y``, ``x2``, ``y2`` plus a
    ``time`` column (row index divided by the 39.06 Hz tracking rate).
    Coordinates equal to -1.0 in the raw file mark missing samples; such
    rows are dropped.
    """
    LOCATION_HZ = 39.06  # position tracking sample rate

    whl = f.extractfile('crcns/hc2/ec014.333/ec014.333.whl')
    frame = pd.read_csv(
        whl,
        sep='\t',
        header=0,
        names=['x','y','x2','y2'],
    )
    frame = frame.replace(-1.0, np.nan)
    frame['time'] = frame.index / LOCATION_HZ
    frame = frame.dropna()
    return frame




def load_hc2(tar_path):
    """Read spikes and location from the hc-2 tarball at ``tar_path``.

    Returns
    -------
    (spikes, location) : tuple of pandas.DataFrame
        Spikes restricted to the time span covered by the position trace,
        and positions recentred so the approximate arena centre maps to
        (0, 0) in both coordinate pairs.
    """
    with tarfile.open(mode="r:gz", name=tar_path) as archive:
        spikes = read_spikes_from_tar(archive)
        location = read_location_from_tar(archive)

    # Keep only spikes that fall inside the tracked interval.
    t_lo = location['time'].min()
    t_hi = location['time'].max()
    in_window = (spikes['time'] >= t_lo) & (spikes['time'] <= t_hi)
    spikes = spikes[in_window]

    # Approximate arena centre: midpoint of the (x2, y2) extent.
    x_center = np.mean([location['x2'].max(), location['x2'].min()])
    y_center = np.mean([location['y2'].max(), location['y2'].min()])

    for column in ('x', 'x2'):
        location[column] -= x_center
    for column in ('y', 'y2'):
        location[column] -= y_center

    return spikes, location


def fetch_rat_hippocampus_foraging(data_home=None,username=None,password=None,download_if_missing=True):
    """Loader for experiment ec014.333 from the HC-2 dataset on crcns.org

    More info on this dataset: https://crcns.org/data-sets/hc/hc-2/about-hc-2

    To download this data, you must have a CRCNS account. Request an account
    at https://crcns.org/request-account/

    Warning! The first time you run this function, it will download a 3.3GB file.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        data is cached in the neuroglia data home directory (see
        ``get_neuroglia_data_home``).
    username : optional, default: None
        CRCNS username. All CRCNS datasets need a username to login. If `None`
        (default), the `CRCNS_USER` environment variable is used.
    password : optional, default: None
        CRCNS password. If `None` (default), the `CRCNS_PASSWORD` environment
        variable is used.
    download_if_missing : optional, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.spikes : dataframe, shape [20640, 2]
        Each row is a single spike at `time` elicited from neuron `neuron`

    dataset.location : dataframe
        Each row is a sample of the rat's position, with the location of the
        head designated by (x,y) and the location of the back designated by
        (x2, y2)

    Notes
    ------
    This dataset consists of 58 simultaneously recorded neurons from the rat
    hippocampus along with coordinates of its position while it forages in an
    open arena (180cm x 180cm) for 92 minutes.

    References
    ----------
    Mizuseki K, Sirota A, Pastalkova E, Buzsaki G. (2009): Multi-unit recordings
    from the rat hippocampus made during open field foraging
    http://dx.doi.org/10.6080/K0Z60KZ9
    """
    data_home = get_neuroglia_data_home(data_home=data_home)
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    # Load the cached pickles if both exist; otherwise (re)build them from
    # the raw CRCNS tarball.
    spikes_path = _pkl_filepath(data_home, 'crcns_hc2_spikes.pkl')
    location_path = _pkl_filepath(data_home, 'crcns_hc2_location.pkl')

    if (not os.path.exists(spikes_path)) or (not os.path.exists(location_path)):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        tar_path = os.path.join(data_home,'crcns_hc2.tar.gz')

        crcns = CRCNSFileMetadata(
            path = "hc-2/ec014.333",
            filename = "ec014.333.tar.gz",
            checksum = '819d9060bcdd439a2024ee44cfb3e7be45056632af052e524e0e23f139c6a260',
        )

        # FIX: use the verified path returned by the fetcher; previously the
        # return value was assigned to an unused variable and tar_path was
        # re-used directly.
        local_filename = _fetch_crcns_datafile(
            crcns=crcns,
            local_filename=tar_path,
            username=username,
            password=password,
        )

        spikes, location = load_hc2(local_filename)

        spikes.to_pickle(spikes_path)
        location.to_pickle(location_path)

        # Delete the 3.3GB tarball once the compact pickle caches exist.
        os.remove(local_filename)
    else:
        spikes = pd.read_pickle(spikes_path)
        location = pd.read_pickle(location_path)

    return Bunch(
        spikes=spikes,
        location=location
    )
Loading

0 comments on commit 046abd8

Please sign in to comment.