
vcf_to_dataframe #252

Open · wants to merge 3 commits into base: master
45 changes: 30 additions & 15 deletions allel/io/vcf_read.py
@@ -50,7 +50,6 @@

def _prep_fields_param(fields):
"""Prepare the `fields` parameter, and determine whether or not to store samples."""

store_samples = False

if fields is None:
@@ -67,7 +66,6 @@ def _prep_fields_param(fields):
store_samples = True
elif '*' in fields:
store_samples = True

return store_samples, fields


@@ -235,6 +233,11 @@ def _chunk_iter_rename(it, rename_fields):
_doc_param_log = \
"""A file-like object (e.g., `sys.stderr`) to print progress information."""

_doc_param_calldata = \
"""If True, also include fields from the 'calldata' group in the dataframe.
Use with care on large population VCFs, and combine with the `samples`
parameter to reduce the memory usage of the returned dataframe."""
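For example, a minimal usage sketch of the new parameters (the file path and sample IDs are hypothetical):

import allel
df = allel.vcf_to_dataframe('example.vcf', calldata=True,
                            samples=['NA00001', 'NA00002'])
# variants columns plus per-sample calldata columns such as GT_NA00001_1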


# noinspection PyShadowingBuiltins
def read_vcf(input,
@@ -301,15 +304,13 @@ def read_vcf(input,
# samples requested?
# noinspection PyTypeChecker
store_samples, fields = _prep_fields_param(fields)

# setup
fields, samples, headers, it = iter_vcf_chunks(
input=input, fields=fields, exclude_fields=exclude_fields, types=types,
numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
samples=samples, transformers=transformers
)

# handle field renaming
if rename_fields:
rename_fields, it = _do_rename(it, fields=fields,
@@ -1133,7 +1134,6 @@ def iter_vcf_chunks(input,
Chunk iterator.

"""

# setup common keyword args
kwds = dict(fields=fields, exclude_fields=exclude_fields, types=types,
numbers=numbers, alt_number=alt_number, chunk_length=chunk_length,
@@ -1145,7 +1145,6 @@

# setup iterator
fields, samples, headers, it = _iter_vcf_stream(stream, **kwds)

# setup transformers
if transformers is not None:
# API flexibility
@@ -1774,13 +1773,14 @@ def _read_vcf_headers(stream):
return VCFHeaders(headers, filters, infos, formats, samples)


def _chunk_to_dataframe(fields, chunk):
def _chunk_to_dataframe(fields, chunk, samples=None):
import pandas
items = list()
for f in fields:
a = chunk[f]
group, name = f.split('/')
assert group == 'variants'
if not samples:
assert group == 'variants'
if a.dtype.kind == 'S':
# always convert strings for pandas - if U then pandas will use object dtype
a = a.astype('U')
@@ -1789,6 +1789,11 @@ def _chunk_to_dataframe(fields, chunk):
elif a.ndim == 2:
for i in range(a.shape[1]):
items.append(('%s_%s' % (name, i + 1), a[:, i]))
elif a.ndim == 3:
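# calldata arrays are shaped (variants, samples, values);
# emit one column per sample per value, named <field>_<sample>_<index>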
assert group == 'calldata'
for sample in range(a.shape[1]):
for i in range(a.shape[2]):
items.append(('%s_%s_%s' % (name, samples[sample], i + 1), a[:, sample, i]))
else:
warnings.warn('cannot handle array %r with >3 dimensions, skipping' % name)
df = pandas.DataFrame.from_dict(OrderedDict(items))
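To illustrate the flattening, a minimal sketch (sample names and shapes are hypothetical, assuming `_chunk_to_dataframe` is in scope):

import numpy as np
# 3 variants: a 2D variants field and a 3D diploid calldata field
chunk = {
    'variants/ALT': np.array([[b'T', b''], [b'G', b'C'], [b'A', b'']]),
    'calldata/GT': np.zeros((3, 2, 2), dtype='i1'),
}
fields = ['variants/ALT', 'calldata/GT']
df = _chunk_to_dataframe(fields, chunk, samples=['S1', 'S2'])
# columns: ALT_1, ALT_2, GT_S1_1, GT_S1_2, GT_S2_1, GT_S2_2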
@@ -1807,6 +1812,8 @@ def vcf_to_dataframe(input,
fills=None,
region=None,
tabix='tabix',
calldata=False,
samples=None,
transformers=None,
buffer_size=DEFAULT_BUFFER_SIZE,
chunk_length=DEFAULT_CHUNK_LENGTH,
@@ -1833,6 +1840,10 @@ def vcf_to_dataframe(input,
{region}
tabix : string, optional
{tabix}
calldata : bool, optional
{calldata}
samples : list of strings, optional
{samples}
transformers : list of transformer objects, optional
{transformers}
buffer_size : int, optional
@@ -1841,7 +1852,6 @@ def vcf_to_dataframe(input,
{chunk_length}
log : file-like, optional
{log}

Returns
-------
df : pandas.DataFrame
@@ -1852,16 +1862,19 @@ def vcf_to_dataframe(input,

# samples requested?
# noinspection PyTypeChecker
_, fields = _prep_fields_param(fields)

if calldata:
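# '*' requests every field (including calldata); note this overrides
# any user-supplied `fields` selection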
fields = '*'
_, fields = _prep_fields_param(fields)
else:
_, fields = _prep_fields_param(fields)
samples = []
# setup
fields, _, _, it = iter_vcf_chunks(
fields, samples, _, it = iter_vcf_chunks(
input=input, fields=fields, exclude_fields=exclude_fields, types=types,
numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=[],
chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=samples,
transformers=transformers
)

# setup progress logging
if log is not None:
it = _chunk_iter_progress(it, log, prefix='[vcf_to_dataframe]')
@@ -1875,7 +1888,7 @@ def vcf_to_dataframe(input,
if chunks:

# concatenate chunks
output = pandas.concat([_chunk_to_dataframe(fields, chunk)
output = pandas.concat([_chunk_to_dataframe(fields, chunk, samples)
for chunk in chunks])

return output
@@ -1895,6 +1908,8 @@ def vcf_to_dataframe(input,
buffer_size=_doc_param_buffer_size,
chunk_length=_doc_param_chunk_length,
log=_doc_param_log,
calldata=_doc_param_calldata,
samples=_doc_param_samples
)

