Source code for glue.core.data_factories.pandas

from __future__ import absolute_import, division, print_function

import numpy as np

from glue.external import six
from glue.core.data_factories.helpers import has_extension
from glue.core.component import Component, CategoricalComponent
from glue.core.data import Data
from glue.config import data_factory


__all__ = ['pandas_read_table']


def panda_process(indf):
    """
    Build a data set from a table using pandas. This attempts to respect
    categorical data input by letting pandas.read_csv infer the type

    """
    result = Data()
    for name, column in indf.iteritems():
        if (column.dtype == np.object) | (column.dtype == np.bool):
            # try to salvage numerical data
            coerced = column.convert_objects(convert_numeric=True)
            if (coerced.dtype != column.dtype) and coerced.isnull().mean() < 0.4:
                c = Component(coerced.values)
            else:
                # pandas has a 'special' nan implementation and this doesn't
                # play well with np.unique
                c = CategoricalComponent(column.fillna(''))
        else:
            c = Component(column.values)

        # convert header to string - in some cases if the first row contains
        # numbers, these are cast to numerical types, so we want to change that
        # here.
        if not isinstance(name, six.string_types):
            name = str(name)

        # strip off leading #
        name = name.strip()
        if name.startswith('#'):
            name = name[1:].strip()

        result.add_component(c, name)

    return result


@data_factory(label="Pandas Table", identifier=has_extension('csv csv txt tsv tbl dat'))
[docs]def pandas_read_table(path, **kwargs):
    """ A factory for reading tabular data using pandas
    :param path: path/to/file
    :param kwargs: All kwargs are passed to pandas.read_csv
    :returns: :class:`glue.core.data.Data` object
    """
    import pandas as pd
    try:
        from pandas.parser import CParserError
    except ImportError:  # pragma: no cover
        from pandas._parser import CParserError

    # iterate over common delimiters to search for best option
    delimiters = kwargs.pop('delimiter', [None] + list(',|\t '))

    fallback = None

    for d in delimiters:
        try:
            indf = pd.read_csv(path, delimiter=d, **kwargs)

            # ignore files parsed to empty dataframes
            if len(indf) == 0:
                continue

            # only use files parsed to single-column dataframes
            # if we don't find a better strategy
            if len(indf.columns) < 2:
                fallback = indf
                continue

            return panda_process(indf)

        except CParserError:
            continue

    if fallback is not None:
        return panda_process(fallback)
    raise IOError("Could not parse %s using pandas" % path)