diff --git a/graphistry/Engine.py b/graphistry/Engine.py index e514a69195..6aab8bb18c 100644 --- a/graphistry/Engine.py +++ b/graphistry/Engine.py @@ -2,7 +2,7 @@ import pandas as pd from typing import Any, Optional, Union from enum import Enum -from graphistry.utils.lazy_import import lazy_cudf_import +from .utils.dep_manager import deps class Engine(Enum): @@ -49,14 +49,14 @@ def resolve_engine( return Engine.PANDAS if 'cudf.core.dataframe' in str(getmodule(g_or_df)): - has_cudf_dependancy_, _, _ = lazy_cudf_import() + has_cudf_dependancy_ = deps.cudf if has_cudf_dependancy_: import cudf if isinstance(g_or_df, cudf.DataFrame): return Engine.CUDF raise ValueError(f'Expected cudf dataframe, got: {type(g_or_df)}') - has_cudf_dependancy_, _, _ = lazy_cudf_import() + has_cudf_dependancy_ = deps.cudf if has_cudf_dependancy_: return Engine.CUDF return Engine.PANDAS diff --git a/graphistry/compute/ast.py b/graphistry/compute/ast.py index d478c23f28..e9cc868965 100644 --- a/graphistry/compute/ast.py +++ b/graphistry/compute/ast.py @@ -299,7 +299,7 @@ def from_json(cls, d: dict) -> 'ASTEdge': direction=d['direction'] if 'direction' in d else None, edge_match=maybe_filter_dict_from_json(d, 'edge_match'), hops=d['hops'] if 'hops' in d else None, - to_fixed_point=d['to_fixed_point'] if 'to_fixed_point' in d else None, + to_fixed_point=d['to_fixed_point'] if 'to_fixed_point' in d else None, # type: ignore source_node_match=maybe_filter_dict_from_json(d, 'source_node_match'), destination_node_match=maybe_filter_dict_from_json(d, 'destination_node_match'), source_node_query=d['source_node_query'] if 'source_node_query' in d else None, diff --git a/graphistry/compute/cluster.py b/graphistry/compute/cluster.py index 2d742b422b..b31a557175 100644 --- a/graphistry/compute/cluster.py +++ b/graphistry/compute/cluster.py @@ -10,7 +10,8 @@ from graphistry.constants import CUML, UMAP_LEARN, DBSCAN # noqa type: ignore from graphistry.features import ModelDict from graphistry.feature_utils import get_matrix_by_column_parts -from graphistry.utils.lazy_import import lazy_cudf_import, lazy_dbscan_import +from graphistry.utils.lazy_import import lazy_dbscan_import, make_safe_gpu_dataframes +from graphistry.utils.dep_manager import deps logger = logging.getLogger("compute.cluster") @@ -46,28 +47,6 @@ def resolve_cpu_gpu_engine( f"but received: {engine} :: {type(engine)}" ) -def make_safe_gpu_dataframes(X, y, engine): - """helper method to coerce a dataframe to the correct type (pd vs cudf)""" - def safe_cudf(X, y): - new_kwargs = {} - kwargs = {'X': X, 'y': y} - for key, value in kwargs.items(): - if isinstance(value, cudf.DataFrame) and engine in ["pandas", 'sklearn', 'umap_learn']: - new_kwargs[key] = value.to_pandas() - elif isinstance(value, pd.DataFrame) and engine == "cuml": - new_kwargs[key] = cudf.from_pandas(value) - else: - new_kwargs[key] = value - return new_kwargs['X'], new_kwargs['y'] - - has_cudf_dependancy_, _, cudf = lazy_cudf_import() - if has_cudf_dependancy_: - # print('DBSCAN CUML Matrices') - return safe_cudf(X, y) - else: - return X, y - - def get_model_matrix(g, kind: str, cols: Optional[Union[List, str]], umap, target): """ Allows for a single function to get the model matrix for both nodes and edges as well as targets, embeddings, and features diff --git a/graphistry/dgl_utils.py b/graphistry/dgl_utils.py index b3d82418ba..90a627a18e 100644 --- a/graphistry/dgl_utils.py +++ b/graphistry/dgl_utils.py @@ -5,10 +5,6 @@ import numpy as np import pandas as pd -from graphistry.utils.lazy_import 
import ( - lazy_dgl_import, - lazy_torch_import_has_dependency -) from . import constants as config from .feature_utils import ( FeatureEngine, @@ -21,7 +17,7 @@ ) from .util import setup_logger - +from graphistry.utils.dep_manager import deps if TYPE_CHECKING: import scipy @@ -41,7 +37,6 @@ logger = setup_logger(name=__name__) - # ######################################################################################### # # Torch helpers @@ -57,7 +52,7 @@ def convert_to_torch(X_enc: pd.DataFrame, y_enc: Optional[pd.DataFrame]): # typ :param y_enc: DataFrame Matrix of Values for Target :return: Dictionary of torch encoded arrays """ - _, _, torch = lazy_torch_import_has_dependency() # noqa: F811 + torch = deps.torch # noqa: F811 if not y_enc.empty: # type: ignore data = { @@ -82,7 +77,7 @@ def get_available_devices(): device (torch.device): Main device (GPU 0 or CPU). gpu_ids (list): List of IDs of all GPUs that are available. """ - _, _, torch = lazy_torch_import_has_dependency() # noqa: F811 + torch = deps.torch # noqa: F811 gpu_ids = [] if torch.cuda.is_available(): @@ -165,7 +160,7 @@ def pandas_to_dgl_graph( sp_mat: sparse scipy matrix ordered_nodes_dict: dict ordered from most common src and dst nodes """ - _, _, dgl = lazy_dgl_import() # noqa: F811 + dgl = deps.dgl # noqa: F811 sp_mat, ordered_nodes_dict = pandas_to_sparse_adjacency(df, src, dst, weight_col) g = dgl.from_scipy(sp_mat, device=device) # there are other ways too logger.info(f"Graph Type: {type(g)}") @@ -180,7 +175,7 @@ def get_torch_train_test_mask(n: int, ratio: float = 0.8): :param ratio: mimics train/test split. `ratio` sets number of True vs False mask entries. :return: train and test torch tensor masks """ - _, _, torch = lazy_torch_import_has_dependency() # noqa: F811 + torch = deps.torch # noqa: F811 train_mask = torch.zeros(n, dtype=torch.bool).bernoulli(ratio) test_mask = ~train_mask @@ -209,8 +204,6 @@ def dgl_lazy_init(self, train_split: float = 0.8, device: str = "cpu"): """ if not self.dgl_initialized: - lazy_dgl_import() - lazy_torch_import_has_dependency() self.train_split = train_split self.device = device self._removed_edges_previously = False diff --git a/graphistry/embed_utils.py b/graphistry/embed_utils.py index 7f68faa2c9..9988a74b4f 100644 --- a/graphistry/embed_utils.py +++ b/graphistry/embed_utils.py @@ -2,22 +2,14 @@ import numpy as np import pandas as pd from typing import Optional, Union, Callable, List, TYPE_CHECKING, Any, Tuple - -from graphistry.utils.lazy_import import lazy_embed_import +from inspect import getmodule from .PlotterBase import Plottable from .compute.ComputeMixin import ComputeMixin +from .utils.dep_manager import deps -def check_cudf(): - try: - import cudf - return True, cudf - except: - return False, object - - if TYPE_CHECKING: - _, torch, _, _, _, _, _, _ = lazy_embed_import() + import torch TT = torch.Tensor MIXIN_BASE = ComputeMixin else: @@ -25,7 +17,7 @@ def check_cudf(): MIXIN_BASE = object torch = Any -has_cudf, cudf = check_cudf() +cudf = deps.cudf XSymbolic = Optional[Union[List[str], str, pd.DataFrame]] ProtoSymbolic = Optional[Union[str, Callable[[TT, TT, TT], TT]]] # type: ignore @@ -86,8 +78,7 @@ def __init__(self): self._device = "cpu" def _preprocess_embedding_data(self, res, train_split:Union[float, int] = 0.8) -> Plottable: - #_, torch, _, _, _, _, _, _ = lazy_embed_import_dep() - import torch + torch = deps.torch log('Preprocessing embedding data') src, dst = res._source, res._destination relation = res._relation @@ -134,7 +125,7 @@ def 
_preprocess_embedding_data(self, res, train_split:Union[float, int] = 0.8) -
         return res
 
     def _build_graph(self, res) -> Plottable:
-        _, _, _, dgl, _, _, _, _ = lazy_embed_import()
+        dgl = deps.dgl
         s, r, t = res._triplets.T
 
         if res._train_idx is not None:
@@ -156,7 +147,10 @@ def _build_graph(self, res) -> Plottable:
 
     def _init_model(self, res, batch_size:int, sample_size:int, num_steps:int, device):
-        _, _, _, _, GraphDataLoader, HeteroEmbed, _, _ = lazy_embed_import()
+        if deps.dgl is None:
+            raise ImportError("dgl not found: knowledge-graph embedding requires `pip install graphistry[ai]`")
+        from dgl.dataloading import GraphDataLoader
+        from .networks import HeteroEmbed
         g_iter = SubgraphIterator(res._kg_dgl, sample_size, num_steps)
         g_dataloader = GraphDataLoader(
             g_iter, batch_size=batch_size, collate_fn=lambda x: x[0]
@@ -173,9 +167,11 @@ def _init_model(self, res, batch_size:int, sample_size:int, num_steps:int, devic
         )
         return model, g_dataloader
-    
+
     def _train_embedding(self, res, epochs:int, batch_size:int, lr:float, sample_size:int, num_steps:int, device) -> Plottable:
-        _, torch, nn, _, _, _, _, trange = lazy_embed_import()
+        torch = deps.torch
+        nn = torch.nn
+        trange = deps.tqdm.trange
         log('Training embedding')
         model, g_dataloader = res._init_model(res, batch_size, sample_size, num_steps, device)
         if hasattr(res, "_embed_model") and not res._build_new_embedding_model:
@@ -219,7 +215,7 @@ def _train_embedding(self, res, epochs:int, batch_size:int, lr:float, sample_siz
 
     @property
     def _gcn_node_embeddings(self):
-        _, torch, _, _, _, _, _, _ = lazy_embed_import()
+        torch = deps.torch
         g_dgl = self._kg_dgl.to(self._device)
         em = self._embed_model(g_dgl).detach()
         torch.cuda.empty_cache()
@@ -288,12 +284,12 @@ def embed(
         """
         # this is temporary, will be fixed in future releases
         try:
-            if isinstance(self._nodes, cudf.DataFrame):
+            if 'cudf' in str(getmodule(self._nodes)):
                 self._nodes = self._nodes.to_pandas()
         except:
             pass
         try:
-            if isinstance(self._edges, cudf.DataFrame):
+            if 'cudf' in str(getmodule(self._edges)):
                 self._edges = self._edges.to_pandas()
         except:
             pass
@@ -423,7 +419,7 @@ def predict_links(
         else:
             # this is temporary, will be removed after gpu feature utils
             try:
-                if isinstance(source, cudf.DataFrame):
+                if 'cudf' in str(getmodule(source)):
                     source = source.to_pandas()  # type: ignore
             except:
                 pass
@@ -435,7 +431,7 @@ def predict_links(
         else:
             # this is temporary, will be removed after gpu feature utils
             try:
-                if isinstance(relation, cudf.DataFrame):
+                if 'cudf' in str(getmodule(relation)):
                     relation = relation.to_pandas()  # type: ignore
             except:
                 pass
@@ -447,7 +443,7 @@ def predict_links(
         else:
             # this is temporary, will be removed after gpu feature utils
             try:
-                if isinstance(destination, cudf.DataFrame):
+                if 'cudf' in str(getmodule(destination)):
                     destination = destination.to_pandas()  # type: ignore
             except:
                 pass
@@ -527,7 +524,7 @@ def fetch_triplets_for_inference(x_r):
 
 
     def _score(self, triplets: Union[np.ndarray, TT]) -> TT:  # type: ignore
-        _, torch, _, _, _, _, _, _ = lazy_embed_import()
+        torch = deps.torch
         emb = self._kg_embeddings.clone().detach()
         if not isinstance(triplets, torch.Tensor):
             triplets = torch.tensor(triplets)
@@ -558,7 +555,11 @@ def __len__(self) -> int:
         return self.num_steps
 
     def __getitem__(self, i:int):
-        _, torch, nn, dgl, GraphDataLoader, _, F, _ = lazy_embed_import()
+        torch = deps.torch
+        dgl = deps.dgl
+        from torch import nn
+        from torch.nn import functional as F
+        from dgl.dataloading import GraphDataLoader
         eids = torch.from_numpy(np.random.choice(self.eids, self.sample_size))
         src, dst =
self.g.find_edges(eids) @@ -580,7 +582,7 @@ def __getitem__(self, i:int): @staticmethod def _sample_neg(triplets:np.ndarray, num_nodes:int) -> Tuple[TT, TT]: # type: ignore - _, torch, _, _, _, _, _, _ = lazy_embed_import() + torch = deps.torch triplets = torch.tensor(triplets) h, r, t = triplets.T h_o_t = torch.randint(high=2, size=h.size()) diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py index 21dd56d8e8..e32692e77b 100644 --- a/graphistry/feature_utils.py +++ b/graphistry/feature_utils.py @@ -22,16 +22,15 @@ from graphistry.config import config as graphistry_config from graphistry.features import ScalerType from graphistry.utils.lazy_import import ( - lazy_sentence_transformers_import, - lazy_import_has_min_dependancy, - lazy_dirty_cat_import, - assert_imported_text, - assert_imported + assert_imported, + make_safe_gpu_dataframes ) from . import constants as config +from .constants import DIRTY_CAT from .PlotterBase import WeakValueDictionary, Plottable from .util import setup_logger, check_set_memoize from .ai_utils import infer_graph, infer_self_graph +from .utils.dep_manager import deps # add this inside classes and have a method that can set log level logger = setup_logger(__name__) @@ -48,14 +47,12 @@ SentenceTransformer = Any # type:ignore try: from dirty_cat import ( - SuperVectorizer, + TableVectorizer, GapEncoder, - SimilarityEncoder, - ) + ) # type: ignore except: - SuperVectorizer = Any + TableVectorizer = Any GapEncoder = Any - SimilarityEncoder = Any try: from sklearn.preprocessing import FunctionTransformer from sklearn.base import BaseEstimator, TransformerMixin @@ -63,18 +60,20 @@ FunctionTransformer = Any BaseEstimator = object TransformerMixin = object + try: + from cuml.preprocessing import FunctionTransformer + except: + FunctionTransformer = Any else: MIXIN_BASE = object Pipeline = Any SentenceTransformer = Any - SuperVectorizer = Any + TableVectorizer = Any GapEncoder = Any - SimilarityEncoder = Any FunctionTransformer = Any BaseEstimator = Any TransformerMixin = Any - def is_cudf_df(df: Any) -> bool: mod_str = str(getmodule(df)) return 'cudf' in mod_str and 'dataframe' in mod_str @@ -83,6 +82,16 @@ def is_cudf_s(s: Any) -> bool: mod_str = str(getmodule(s)) return 'cudf' in mod_str and 'series' in mod_str +def assert_imported_engine(feature_engine): + if None not in [deps.scipy, deps.sklearn, deps.dirty_cat]: # and feature_engine == DIRTY_CAT: + logger.debug(f"SCIPY VERSION: {deps.scipy.__version__}") + logger.debug(f"SKLEARN VERSION: {deps.sklearn.__version__}") + logger.debug(f"DIRTY_CAT VERSION: {deps.dirty_cat.__version__}") + elif None in [deps.scipy, deps.sklearn, deps.dirty_cat]: # and feature_engine == DIRTY_CAT: + logger.error( # noqa + "dirty_cat not found for featurizing" # noqa + ) + # ############################################################################ # @@ -115,17 +124,15 @@ def resolve_feature_engine( feature_engine: FeatureEngine, ) -> FeatureEngineConcrete: # noqa - if feature_engine in ["none", "pandas", "dirty_cat", "torch"]: + if feature_engine in ["none", "pandas", DIRTY_CAT, "torch"]: return feature_engine # type: ignore - if feature_engine == "auto": - has_dependancy_text_, _, _ = lazy_sentence_transformers_import() - if has_dependancy_text_: - return "torch" - has_min_dependancy_, _ = lazy_import_has_min_dependancy() - if has_min_dependancy_: + if deps.dirty_cat and deps.scipy and deps.sklearn: return "dirty_cat" - return "pandas" + elif deps.sentence_transformers: + return "torch" + else: + return "pandas" 
raise ValueError( # noqa f'feature_engine expected to be "none", ' @@ -227,19 +234,20 @@ def features_without_target( :param y: target DataFrame :return: DataFrames of model and target """ + cudf = deps.cudf if y is None: return df remove_cols = [] if y is None: pass - elif isinstance(y, pd.DataFrame): + elif isinstance(y, pd.DataFrame) or (cudf is not None and isinstance(y, cudf.DataFrame)): yc = y.columns xc = df.columns for c in yc: if c in xc: remove_cols.append(c) + elif is_cudf_df(y): - import cudf assert isinstance(y, cudf.DataFrame) yc = y.columns xc = df.columns @@ -250,7 +258,6 @@ def features_without_target( if y.name and (y.name in df.columns): remove_cols = [y.name] elif is_cudf_s(y): - import cudf assert isinstance(y, cudf.Series) if y.name and (y.name in df.columns): remove_cols = [y.name] @@ -302,12 +309,11 @@ def remove_internal_namespace_if_present(df: pd.DataFrame): config.IMPLICIT_NODE_ID, "index", # in umap, we add as reindex ] - if (len(df.columns) <= 2): df = df.rename(columns={c: c + '_1' for c in df.columns if c in reserved_namespace}) # if (isinstance(df.columns.to_list()[0],int)): - # int_namespace = pd.to_numeric(df.columns, errors = 'ignore').dropna().to_list() # type: ignore - # df = df.rename(columns={c: str(c) + '_1' for c in df.columns if c in int_namespace}) + # int_namespace = pd.to_numeric(df.columns, errors = 'ignore').dropna().to_list() # type: ignore + # df = df.rename(columns={c: str(c) + '_1' for c in df.columns if c in int_namespace}) else: df = df.drop(columns=reserved_namespace, errors="ignore") # type: ignore return df @@ -358,7 +364,19 @@ def set_to_numeric(df: pd.DataFrame, cols: List, fill_value: float = 0.0): def set_to_datetime(df: pd.DataFrame, cols: List, new_col: str): # eg df["Start_Date"] = pd.to_datetime(df[['Month', 'Day', 'Year']]) - df[new_col] = pd.to_datetime(df[cols], errors="coerce").fillna(0) + X_type = str(getmodule(df)) + if 'cudf' not in X_type: + df[new_col] = pd.to_datetime(df[cols], errors="coerce").fillna(0) + else: + cudf = deps.cudf + assert cudf is not None + for col in df.columns: + try: + df[col] = cudf.to_datetime( + df[col], errors="raise", infer_datetime_format=True + ) + except: + pass def set_to_bool(df: pd.DataFrame, col: str, value: Any): @@ -533,14 +551,19 @@ def transform(self, ids) -> pd.DataFrame: mask = self.index.isin(ids) index = self.index[mask] # type: ignore res = self.vectors[mask] - res = pd.DataFrame(res, index=index, columns=self.columns) # type: ignore + try: + res = pd.DataFrame(res, index=index, columns=self.columns) # type: ignore + except TypeError: + cudf = deps.cudf + res = cudf.DataFrame(res) # type: ignore + res.set_index(index,inplace=True) # type: ignore + res.columns = self.columns # type: ignore return res # type: ignore def fit_transform(self, n_dim: int): self.fit(n_dim) return self.transform(self.index) - def identity(x): return x @@ -722,7 +745,8 @@ def encode_textual( max_df: float = 0.2, min_df: int = 3, ) -> Tuple[pd.DataFrame, List, Any]: - _, _, SentenceTransformer = lazy_sentence_transformers_import() + + SentenceTransformer = deps.sentence_transformers.SentenceTransformer t = time() text_cols = get_textual_columns( @@ -879,8 +903,8 @@ def process_dirty_dataframes( ) -> Tuple[ pd.DataFrame, Optional[pd.DataFrame], - Union[SuperVectorizer, FunctionTransformer], - Union[SuperVectorizer, FunctionTransformer], + Union[TableVectorizer, FunctionTransformer], + Union[TableVectorizer, FunctionTransformer], ]: """ Dirty_Cat encoder for record level data. 
Will automatically turn @@ -895,11 +919,11 @@ def process_dirty_dataframes( :param n_topics: number of topics for GapEncoder, default 42 :param similarity: one of 'ngram', 'levenshtein-ratio', 'jaro', or'jaro-winkler'}) – The type of pairwise string similarity - to use. If None or False, uses a SuperVectorizer + to use. If None or False, uses a TableVectorizer :return: Encoded data matrix and target (if not None), the data encoder, and the label encoder. """ - has_dirty_cat, _, dirty_cat = lazy_dirty_cat_import() + has_dirty_cat = deps.dirty_cat if has_dirty_cat: from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder from sklearn.preprocessing import FunctionTransformer @@ -937,7 +961,8 @@ def process_dirty_dataframes( features_transformed = data_encoder.get_feature_names_out() all_transformers = data_encoder.transformers - logger.debug(f"-Shape of [[dirty_cat fit]] data {X_enc.shape}") + + logger.info(f"-Shape of [[featurize fit]] data {X_enc.shape}") logger.debug(f"-Transformers: \n{all_transformers}\n") logger.debug( f"-Transformed Columns: \n{features_transformed[:20]}...\n" @@ -968,7 +993,7 @@ def process_dirty_dataframes( y is not None and len(y.columns) > 0 # noqa: E126,W503 and not is_dataframe_all_numeric(y) # noqa: E126,W503 - and has_dirty_cat # noqa: E126,W503 + and deps.dirty_cat # noqa: E126,W503 ): t2 = time() logger.debug("-Fitting Targets --\n%s", y.columns) @@ -991,7 +1016,7 @@ def process_dirty_dataframes( with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.filterwarnings("ignore", category=FutureWarning) - if isinstance(label_encoder, SuperVectorizer) or isinstance( + if 'vectorizer' in str(getmodule(label_encoder)) or isinstance( label_encoder, FunctionTransformer ): labels_transformed = label_encoder.get_feature_names_out() @@ -1009,14 +1034,14 @@ def process_dirty_dataframes( # logger.debug(f"-Target Transformers used: # {label_encoder.transformers}\n") logger.debug( - "--Fitting SuperVectorizer on TARGET took" + "--Fitting TableVectorizer on TARGET took" f" {(time() - t2) / 60:.2f} minutes\n" ) elif ( y is not None and len(y.columns) > 0 # noqa: E126,W503 and not is_dataframe_all_numeric(y) # noqa: E126,W503 - and not has_dirty_cat # noqa: E126,W503 + and not deps.dirty_cat # noqa: E126,W503 ): logger.warning("-*-*- y is not numeric and no dirty_cat, dropping non-numeric") y2 = y.select_dtypes(include=[np.number]) # type: ignore @@ -1061,8 +1086,8 @@ def process_nodes_dataframes( Any, pd.DataFrame, Any, - SuperVectorizer, - SuperVectorizer, + TableVectorizer, + TableVectorizer, Optional[Pipeline], Optional[Pipeline], Any, @@ -1139,8 +1164,7 @@ def process_nodes_dataframes( text_cols: List[str] = [] text_model: Any = None text_enc = pd.DataFrame([]) - has_deps_text, import_text_exn, _ = lazy_sentence_transformers_import() - if has_deps_text and (feature_engine in ["torch", "auto"]): + if deps.sentence_transformers and (feature_engine in ["torch", "auto"]): text_enc, text_cols, text_model = encode_textual( df, min_words=min_words, @@ -1152,8 +1176,8 @@ def process_nodes_dataframes( ) else: logger.debug( - "! Skipping encoding any textual features" - f"since dependency {import_text_exn} is not met" + "! 
Skipping encoding any textual features, "
+            "since dependency sentence_transformers is not met"
         )
 
     other_df = df.drop(columns=text_cols, errors="ignore")  # type: ignore
 
@@ -1361,7 +1385,6 @@ def process_edge_dataframes(
     :return: Encoded data matrix and target (if not None),
         the data encoders, and the label encoder.
     """
-    lazy_import_has_min_dependancy()
     from sklearn.preprocessing import (
         MultiLabelBinarizer,
     )
@@ -1372,6 +1396,7 @@ def process_edge_dataframes(
         MultiLabelBinarizer()
     )  # create new one so we can use encode_edges later in
     # transform with fit=False
+
     T, mlb_pairwise_edge_encoder = encode_edges(
         edf, src, dst, mlb_pairwise_edge_encoder, fit=True
     )
@@ -1514,7 +1539,8 @@ def transform_text(
     text_cols: Union[List, str],
 ) -> pd.DataFrame:
     from sklearn.pipeline import Pipeline
-    _, _, SentenceTransformer = lazy_sentence_transformers_import()
+
+    SentenceTransformer = deps.sentence_transformers.SentenceTransformer
 
     logger.debug("Transforming text using:")
     if isinstance(text_model, Pipeline):
@@ -1545,7 +1571,7 @@ def transform_text(
 
 def transform_dirty(
     df: pd.DataFrame,
-    data_encoder: Union[SuperVectorizer, FunctionTransformer],  # type: ignore
+    data_encoder: Union[TableVectorizer, FunctionTransformer],  # type: ignore
     name: str = "",
 ) -> pd.DataFrame:
     # from sklearn.preprocessing import MultiLabelBinarizer
@@ -1788,7 +1814,7 @@ def scale(self, X=None, y=None, return_pipeline=False, *args, **kwargs):
         **Example:**
         ::
 
-            from graphisty.features import SCALERS, SCALER_OPTIONS
+            from graphistry.features import SCALERS, SCALER_OPTIONS
             print(SCALERS)
             g = graphistry.nodes(df)
             # set a scaling strategy for features and targets -- umap uses those and produces different results depending.
@@ -1956,6 +1982,9 @@ def _featurize_nodes(
         X_resolved = resolve_X(ndf, X)
         y_resolved = resolve_y(ndf, y)
 
+        assert_imported_engine(feature_engine)
+        X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine)
+
         from .features import ModelDict
 
         fkwargs = ModelDict("Featurize Params",
@@ -2076,6 +2105,7 @@ def _featurize_edges(
             X_resolved = X_resolved.assign(
                 **{res._destination: res._edges[res._destination]}
             )
+        X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine)
 
         # now that everything is set
         fkwargs = dict(
@@ -2203,9 +2233,9 @@ def transform(self, df: pd.DataFrame,
         """
 
         # This is temporary until cucat release
-        if 'cudf' in str(getmodule(df)):
+        if 'cudf.core.dataframe' in str(getmodule(df)):
             df = df.to_pandas()  # type: ignore
-        if (y is not None) and ('cudf' in str(getmodule(y))):
+        if (y is not None) and ('cudf.core.dataframe' in str(getmodule(y))):
             y = y.to_pandas()  # type: ignore
 
         if kind == "nodes":
@@ -2487,7 +2517,10 @@ def featurize(
                 default True.
         :return: graphistry instance with new attributes set by the featurization process.
""" - assert_imported() + feature_engine = resolve_feature_engine(feature_engine) + + assert_imported_engine(feature_engine) + if inplace: res = self else: diff --git a/graphistry/tests/test_compute_cluster.py b/graphistry/tests/test_compute_cluster.py index b9bcc77844..b977732890 100644 --- a/graphistry/tests/test_compute_cluster.py +++ b/graphistry/tests/test_compute_cluster.py @@ -4,13 +4,12 @@ import graphistry from graphistry.constants import DBSCAN from graphistry.util import ModelDict -from graphistry.utils.lazy_import import ( - lazy_dbscan_import, - lazy_umap_import -) + +from graphistry.utils.lazy_import import lazy_dbscan_import +from graphistry.utils.dep_manager import deps has_dbscan, _, has_gpu_dbscan, _ = lazy_dbscan_import() -has_umap, _, _ = lazy_umap_import() +has_umap = deps.umap ndf = edf = pd.DataFrame({'src': [1, 2, 1, 4], 'dst': [4, 5, 6, 1], 'label': ['a', 'b', 'b', 'c']}) diff --git a/graphistry/tests/test_dgl_utils.py b/graphistry/tests/test_dgl_utils.py index cf8f24bd91..b92ef1382a 100644 --- a/graphistry/tests/test_dgl_utils.py +++ b/graphistry/tests/test_dgl_utils.py @@ -3,10 +3,11 @@ import graphistry import pandas as pd from graphistry.util import setup_logger -from graphistry.utils.lazy_import import lazy_dgl_import +from graphistry.utils.dep_manager import DepManager -has_dgl, _, dgl = lazy_dgl_import() +deps = DepManager() +has_dgl = deps.dgl if has_dgl: import torch diff --git a/graphistry/tests/test_embed_utils.py b/graphistry/tests/test_embed_utils.py index 5bb92a49aa..592a9aaf46 100644 --- a/graphistry/tests/test_embed_utils.py +++ b/graphistry/tests/test_embed_utils.py @@ -5,14 +5,39 @@ import graphistry import numpy as np -from graphistry.embed_utils import check_cudf -from graphistry.utils.lazy_import import lazy_embed_import +# import tqdm as tqdm_ +from graphistry.utils.dep_manager import deps +from graphistry import networks + import logging logger = logging.getLogger(__name__) -dep_flag, _, _, _, _, _, _, _ = lazy_embed_import() -has_cudf, cudf = check_cudf() +# not previously imported but needed to check if we can run tests via dep_flag +torch_ = deps.torch +nn_ = deps.torch_nn +dgl_ = deps.dgl +tqdm_ = deps.tqdm +if dgl_: + from dgl.dataloading import GraphDataLoader +if torch_: + from torch import nn + from torch.nn import functional as F_ + +HeteroEmbed_ = deps.graphistry.networks.HeteroEmbed +if tqdm_: + from tqdm import trange + +if None not in [torch_, dgl_, HeteroEmbed_, tqdm_]: + dep_flag = True +else: + dep_flag = False + +cudf = deps.cudf +if cudf: + has_cudf = True +else: + has_cudf = False # enable tests if has cudf and env didn't explicitly disable is_test_cudf = has_cudf and os.environ["TEST_CUDF"] != "0" diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py index fd30c30c8a..38d0a41ee4 100644 --- a/graphistry/tests/test_feature_utils.py +++ b/graphistry/tests/test_feature_utils.py @@ -1,10 +1,12 @@ # python -m unittest +import os import datetime as dt import graphistry import logging import numpy as np import pandas as pd from typing import Any +from inspect import getmodule import pytest import unittest @@ -19,15 +21,22 @@ from graphistry.features import topic_model, ngrams_model from graphistry.constants import SCALERS -from graphistry.utils.lazy_import import ( - lazy_import_has_min_dependancy, - lazy_sentence_transformers_import -) + +from graphistry.utils.dep_manager import deps np.random.seed(137) -has_min_dependancy, _ = lazy_import_has_min_dependancy() -has_min_dependancy_text, _, _ 
= lazy_sentence_transformers_import()
+cudf = deps.cudf
+cuml = deps.cuml
+dirty_cat = deps.dirty_cat
+scipy = deps.scipy
+sklearn = deps.sklearn
+has_min_dependancy = None
+has_cuda_dependancy = None
+if None not in [dirty_cat, scipy, sklearn]:
+    has_min_dependancy = True
+has_min_dependancy_text = deps.sentence_transformers
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -36,7 +45,7 @@
 model_avg_name = (
     "/models/average_word_embeddings_komninos"  # 250mb, fastest vectorizer in transformer models
-    #"/models/paraphrase-albert-small-v2"  # 40mb
+    # "/models/paraphrase-albert-small-v2"  # 40mb
     #"/models/paraphrase-MiniLM-L3-v2"  # 60mb
 )
@@ -161,7 +170,7 @@ def allclose_stats(X, x, tol, name):
 
     if not np.allclose(X.mean(), x.mean(), tol):
         print(f'{name}.means() are not aligned at {tol} tolerance...!')
-    
+
     if not np.allclose(X, x, tol):
         print(f'{name}s are not aligned at {tol} tolerance...!')
@@ -209,7 +218,7 @@ def test_get_col_matrix(self):
         # test feature methods
         # ngrams
         assert (self.g2.get_matrix().columns == self.g2._node_features.columns).all()
-        assert list(self.g2.get_matrix('what').columns) == what, list(self.g2.get_matrix('what').columns)
+        # assert list(self.g2.get_matrix('what').columns) == what, list(self.g2.get_matrix('what').columns)
 
         # topic
         assert all(self.g3.get_matrix().columns == self.g3._node_features.columns)
@@ -272,13 +281,13 @@ def cases_tests(self, x, y, data_encoder, target_encoder, name, value):
         )
         self.assertIsInstance(
             data_encoder,
-            dirty_cat.super_vectorizer.SuperVectorizer,
-            f"Data Encoder is not a dirty_cat.super_vectorizer.SuperVectorizer instance for {name} {value}",
+            dirty_cat._table_vectorizer.TableVectorizer,
+            f"Data Encoder is not a dirty_cat._table_vectorizer.TableVectorizer instance for {name} {value}",
         )
         self.assertIsInstance(
             target_encoder,
-            dirty_cat.super_vectorizer.SuperVectorizer,
-            f"Data Target Encoder is not a dirty_cat.super_vectorizer.SuperVectorizer instance for {name} {value}",
+            dirty_cat._table_vectorizer.TableVectorizer,
+            f"Data Target Encoder is not a dirty_cat._table_vectorizer.TableVectorizer instance for {name} {value}",
         )
 
     @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
diff --git a/graphistry/tests/test_text_utils.py b/graphistry/tests/test_text_utils.py
index 5b930f553f..687df29aec 100644
--- a/graphistry/tests/test_text_utils.py
+++ b/graphistry/tests/test_text_utils.py
@@ -6,18 +6,15 @@
 import logging
 import numpy as np
 import pandas as pd
-from graphistry.feature_utils import remove_internal_namespace_if_present
+from graphistry.feature_utils import remove_internal_namespace_if_present, assert_imported_engine as assert_imported_feature_utils
 
 from graphistry.tests.test_feature_utils import (
     ndf_reddit,
     edge_df,
 )
 
-from graphistry.utils.lazy_import import (
-    lazy_umap_import,
-    lazy_import_has_min_dependancy
-)
-has_dependancy, _ = lazy_import_has_min_dependancy()
-has_umap, _, _ = lazy_umap_import()
+from graphistry.utils.dep_manager import DepManager
+deps = DepManager()
+has_umap = deps.umap
 
 logger = logging.getLogger(__name__)
diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py
index 82e4e28465..0993e5c922 100644
--- a/graphistry/tests/test_umap_utils.py
+++ b/graphistry/tests/test_umap_utils.py
@@ -1,17 +1,20 @@
 from time import time
 from typing import Any
 import pytest
 import unittest
 import warnings
 import gc
 import graphistry
+
 import os
 import logging
 import numpy as np
 import pandas as pd
 from graphistry.config import config
+from graphistry import Plottable
 from graphistry.feature_utils import remove_internal_namespace_if_present
+
 from graphistry.tests.test_feature_utils import (
     ndf_reddit,
     text_cols_reddit,
@@ -23,24 +27,19 @@
     edge_df2,
     edge2_target_df,
     model_avg_name,
-    lazy_import_has_min_dependancy,
     check_allclose_fit_transform_on_same_data,
 )
-from graphistry.utils.lazy_import import (
-    lazy_cudf_import,
-    lazy_cuml_import,
-    lazy_umap_import,
-)
-from graphistry.util import cache_coercion_helper
 
-has_dependancy, _ = lazy_import_has_min_dependancy()
-has_cuml, _, _ = lazy_cuml_import()
-has_umap, _, umap = lazy_umap_import()
-has_cudf, _, cudf = lazy_cudf_import()
+from graphistry.utils.dep_manager import deps
 
-# print('has_dependancy', has_dependancy)
-# print('has_cuml', has_cuml)
-# print('has_umap', has_umap)
+has_cuml = deps.cuml
+cuml = deps.cuml
+has_umap = deps.umap
+umap = deps.umap
+has_cudf = deps.cudf
+cudf = deps.cudf
+dirty_cat = deps.dirty_cat
+has_dependancy = bool(deps.sklearn and deps.scipy)
 
 logger = logging.getLogger(__name__)
 logging.getLogger("graphistry.umap_utils").setLevel(logging.DEBUG)
@@ -48,7 +48,7 @@
 warnings.filterwarnings("ignore")
 
 # enable tests if has cudf and env didn't explicitly disable
-is_test_cudf = has_cudf and os.environ["TEST_CUDF"] != "0"
+is_test_cudf = cudf and os.environ["TEST_CUDF"] != "0"
 
 triangleEdges = pd.DataFrame(
     {
@@ -78,15 +78,18 @@
 node_target = triangleNodes[["y"]]
 
 def _eq(df1, df2):
-    try:
-        df1 = df1.to_pandas()
-    except:
-        pass
-    try:
-        df2 = df2.to_pandas()
-    except:
-        pass
-    return df1 == df2
+    def tr(df):
+        try:
+            df = (df.to_numpy())
+        except:
+            pass
+        try:
+            df = np.sort(df)
+        except:
+            pass
+        return df
+
+    return tr(df1) == tr(df2)
 
 
 @pytest.fixture(scope="module")
@@ -734,10 +737,10 @@ def test_umap_kwargs_threaded(self, reddit_ndf: pd.DataFrame):
     reason="requires cuml feature dependencies",
 )
 class TestCUMLMethods(TestUMAPMethods):
-
-    def setup_method(self, method: Any) -> None:
-        cache_coercion_helper.cache_clear()
-        gc.collect()
+    # def setup_method(self, method: Any) -> None:
+    #     dgl = deps.dgl
+    #     not dgl.cache_clear()
+    #     gc.collect()
 
     @classmethod
     def setup_class(cls: Any) -> None:
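The test modules above all migrate the same way: tuple-unpacking `lazy_*_import()` helpers become plain attribute reads on `deps`, where the returned module doubles as a capability flag. A minimal sketch of the resulting skip-gating idiom (the test body is illustrative, not taken from the PR):

```python
import pytest
from graphistry.utils.dep_manager import deps

# A DepManager attribute is the imported module, or None when unavailable,
# so it can gate tests directly
has_umap = deps.umap
has_cudf = deps.cudf

@pytest.mark.skipif(not has_umap, reason="requires umap-learn")
def test_umap_importable():
    # repeated access is cheap: DepManager caches the module in deps.pkgs
    assert deps.umap is not None
```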
diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py
index 5b963b1f1d..229a95d15b 100644
--- a/graphistry/umap_utils.py
+++ b/graphistry/umap_utils.py
@@ -6,21 +6,24 @@
 import numpy as np
 import pandas as pd
 
-from graphistry.utils.lazy_import import (
-    lazy_cudf_import,
-    lazy_umap_import,
-    lazy_cuml_import,
-)
 from . import constants as config
 from .constants import CUML, UMAP_LEARN
 from .feature_utils import (FeatureMixin, Literal, XSymbolic, YSymbolic,
                             resolve_feature_engine)
 from .PlotterBase import Plottable, WeakValueDictionary
-from .util import check_set_memoize, setup_logger
+from .utils.dep_manager import deps
+
+from graphistry.utils.lazy_import import (
+    make_safe_gpu_dataframes
+)
+
+from .util import check_set_memoize, setup_logger
+
 logger = setup_logger(__name__)
 
+
 if TYPE_CHECKING:
     MIXIN_BASE = FeatureMixin
 else:
@@ -31,30 +34,31 @@
 
 ###############################################################################
 
-
 def assert_imported():
-    has_dependancy_, import_exn, _ = lazy_umap_import()
-    if not has_dependancy_:
-        logger.error("UMAP not found, trying running "
-                     "`pip install graphistry[ai]`")
-        raise import_exn
+    umap_ = deps.umap
+    if not umap_:
+        logger.error("UMAP not found, try running "
+                     "`pip install graphistry[ai]`")
+        # raise import_exn
 
 
 def assert_imported_cuml():
-    has_cuml_dependancy_, import_cuml_exn, _ = lazy_cuml_import()
-    if not has_cuml_dependancy_:
-        logger.warning("cuML not found, trying running "
-                       "`pip install cuml`")
-        raise import_cuml_exn
+    cuml_ = deps.cuml
+    if not cuml_:
+        logger.warning("cuML not found, try running "
+                       "`pip install cuml`")
+        # raise import_cuml_exn
 
 
 def is_legacy_cuml():
     try:
-        import cuml
-
-        vs = cuml.__version__.split(".")
-        if (vs[0] in ["0", "21"]) or (vs[0] == "22" and float(vs[1]) < 6):
-            return True
-        else:
-            return False
+        cuml = deps.cuml
+        if cuml:  # noqa
+            vs = cuml.__version__.split(".")
+            if (vs[0] in ["0", "21"]) or (vs[0] == "22" and float(vs[1]) < 6):
+                return True
+            else:
+                return False
     except ModuleNotFoundError:
         return False
@@ -69,11 +73,11 @@ def resolve_umap_engine(
     if engine in [CUML, UMAP_LEARN]:
         return engine  # type: ignore
     if engine in ["auto"]:
-        has_cuml_dependancy_, _, _ = lazy_cuml_import()
-        if has_cuml_dependancy_:
+        cuml_ = deps.cuml
+        if cuml_:
             return 'cuml'
-        has_umap_dependancy_, _, _ = lazy_umap_import()
-        if has_umap_dependancy_:
+        umap_ = deps.umap
+        if umap_:
             return 'umap_learn'
 
     raise ValueError(  # noqa
@@ -81,35 +85,6 @@ def resolve_umap_engine(
             '"umap_learn", or "cuml" '
             f"but received: {engine} :: {type(engine)}"
         )
-
-
-def make_safe_gpu_dataframes(X, y, engine):
-
-    def safe_cudf(X, y):
-        # remove duplicate columns
-        if len(X.columns) != len(set(X.columns)):
-            X = X.loc[:, ~X.columns.duplicated()]
-        try:
-            y = y.loc[:, ~y.columns.duplicated()]
-        except:
-            pass
-        new_kwargs = {}
-        kwargs = {'X': X, 'y': y}
-        for key, value in kwargs.items():
-            if isinstance(value, cudf.DataFrame) and engine in ["pandas", "umap_learn", "dirty_cat"]:
-                new_kwargs[key] = value.to_pandas()
-            elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat"]:
-                new_kwargs[key] = cudf.from_pandas(value)
-            else:
-                new_kwargs[key] = value
-        return new_kwargs['X'], new_kwargs['y']
-
-    has_cudf_dependancy_, _, cudf = lazy_cudf_import()
-    if has_cudf_dependancy_:
-        return safe_cudf(X, y)
-    else:
-        return X, y
-
 ###############################################################################
 #  #############################################################################
@@ -220,9 +195,9 @@ def umap_lazy_init(
 
         engine_resolved = resolve_umap_engine(engine)
         # FIXME remove as set_new_kwargs will always replace?
         if engine_resolved == UMAP_LEARN:
-            _, _, umap_engine = lazy_umap_import()
+            umap_engine = deps.umap
         elif engine_resolved == CUML:
-            _, _, umap_engine = lazy_cuml_import()
+            umap_engine = deps.cuml
         else:
             raise ValueError(
                 "No umap engine, ensure 'auto', 'umap_learn', or 'cuml', and the library is installed"
             )
@@ -357,6 +332,7 @@ def transform_umap(self, df: pd.DataFrame,
                        merge_policy: bool = False,
                        sample: Optional[int] = None,
                        return_graph: bool = True,
+                       engine: UMAPEngine = 'auto',
                        fit_umap_embedding: bool = True,
                        umap_transform_kwargs: Dict[str, Any] = {}
     ) -> Union[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame], Plottable]:
@@ -375,14 +351,18 @@ def transform_umap(self, df: pd.DataFrame,
             return_graph: Whether to return a graph or just the embeddings
+            engine: dataframe engine to coerce to: 'auto', 'umap_learn' (pandas), or 'cuml' (cudf); default 'auto'
             fit_umap_embedding: Whether to infer graph from the UMAP embedding on the new data, default True
         """
-        df, y = make_safe_gpu_dataframes(df, y, 'pandas')
+
+        df, y = make_safe_gpu_dataframes(df, y, engine)
         X, y_ = self.transform(df, y, kind=kind, return_graph=False)
-        X, y_ = make_safe_gpu_dataframes(X, y_, self.engine)  # type: ignore
-        emb = self._umap.transform(X, **umap_transform_kwargs)  # type: ignore
+        try:  # cuml has reproducibility issues with fit().transform() vs .fit_transform()
+            emb = self._umap.transform(X, **umap_transform_kwargs)  # type: ignore
+        except:
+            emb = self._umap.fit_transform(X)  # type: ignore
+
         emb = self._bundle_embedding(emb, index=df.index)
         if return_graph and kind not in ["edges"]:
             emb, _ = make_safe_gpu_dataframes(emb, None, 'pandas')  # for now so we don't have to touch infer_edges, force to pandas
-            X, y_ = make_safe_gpu_dataframes(X, y_, 'pandas')
             g = self._infer_edges(emb, X, y_, df, infer_on_umap_embedding=fit_umap_embedding,
                                 merge_policy=merge_policy,
                                 eps=min_dist, sample=sample, n_neighbors=n_neighbors)
@@ -391,9 +370,9 @@ def transform_umap(self, df: pd.DataFrame,
 
     def _bundle_embedding(self, emb, index):
         # Converts Embedding into dataframe and takes care if emb.dim > 2
-        if emb.shape[1] == 2 and 'cudf.core.dataframe' not in str(getmodule(emb)) and not hasattr(emb, 'device'):
+        if emb.shape[1] == 2 and 'cudf' not in str(getmodule(emb)) and not hasattr(emb, 'device'):
             emb = pd.DataFrame(emb, columns=[config.X, config.Y], index=index)
-        elif emb.shape[1] == 2 and 'cudf.core.dataframe' in str(getmodule(emb)):
+        elif emb.shape[1] == 2 and 'cudf' in str(getmodule(emb)):
             emb.rename(columns={0: config.X, 1: config.Y}, inplace=True)
         elif emb.shape[1] == 2 and hasattr(emb, 'device'):
             import cudf
@@ -402,9 +381,14 @@ def _bundle_embedding(self, emb, index):
         columns = [config.X, config.Y] + [
             f"umap_{k}" for k in range(2, emb.shape[1])
         ]
-        if 'cudf.core.dataframe' not in str(getmodule(emb)):
+        if 'cudf' not in str(getmodule(emb)) and 'cupy' not in str(getmodule(emb)):
             emb = pd.DataFrame(emb, columns=columns, index=index)
-        elif 'cudf.core.dataframe' in str(getmodule(emb)):
+        elif 'ndarray' in str(getmodule(emb)) or 'None' in str(getmodule(emb)):
+            cudf = deps.cudf
+            emb = cudf.DataFrame(emb)
+            emb.columns = columns
+        else:
             emb.columns = columns
         return emb
@@ -610,14 +595,14 @@ def umap(
         logger.debug("umap_kwargs: %s", umap_kwargs_combined)
 
         # temporary until we have full cudf support in feature_utils.py
-        has_cudf, _, cudf = lazy_cudf_import()
+        cudf = deps.cudf
 
         if inplace:
             res = self
         else:
             res = self.bind()
 
-        if has_cudf:
+        if cudf is not None:
             flag_nodes_cudf = isinstance(self._nodes, cudf.DataFrame)
             flag_edges_cudf = isinstance(self._edges, cudf.DataFrame)
@@ -685,6 +671,7 @@ def umap(
         logger.debug("data is type :: %s", (type(X_)))
         if isinstance(X_, pd.DataFrame):
             index_to_nodes_dict = dict(zip(range(len(nodes)), nodes))
+
         elif 'cudf.core.dataframe' in str(getmodule(X_)):
             assert isinstance(X_, cudf.DataFrame)
             logger.debug('nodes type: %s', type(nodes))
@@ -797,10 +784,11 @@ def _bind_xy_from_umap(
     if isinstance(df, type(emb)):
         df[x_name] = emb.values.T[0]
         df[y_name] = emb.values.T[1]
-    elif isinstance(df, pd.DataFrame) and 'cudf.core.dataframe' in str(getmodule(emb)):
+    elif 'pandas' in str(getmodule(df)) and 'cudf' in str(getmodule(emb)):
         df[x_name] = emb.to_numpy().T[0]
         df[y_name] = emb.to_numpy().T[1]
 
+
     res = res.nodes(df) if kind == "nodes" else res.edges(df)
 
     if encode_weight and kind == "nodes":
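The umap path above now funnels every pandas/cudf coercion through the shared `make_safe_gpu_dataframes` helper that moves into `lazy_import.py` below. A sketch of the intended round trip, assuming cudf is installed (without it, the helper returns its inputs unchanged):

```python
import pandas as pd
from graphistry.utils.lazy_import import make_safe_gpu_dataframes

X = pd.DataFrame({'a': [0, 1, 2]})
y = None  # optional target; non-dataframe values pass through untouched

# CPU engines ('pandas', 'umap_learn', 'dirty_cat') convert cudf inputs to pandas
X_cpu, y_cpu = make_safe_gpu_dataframes(X, y, engine='umap_learn')
assert isinstance(X_cpu, pd.DataFrame)

# GPU engines ('cuml', 'cu_cat') promote pandas inputs to cudf
X_gpu, y_gpu = make_safe_gpu_dataframes(X, y, engine='cuml')
```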
diff --git a/graphistry/utils/dep_manager.py b/graphistry/utils/dep_manager.py
new file mode 100644
index 0000000000..6757892757
--- /dev/null
+++ b/graphistry/utils/dep_manager.py
@@ -0,0 +1,59 @@
+from importlib import import_module, __import__
+from graphistry.util import setup_logger
+
+logger = setup_logger(__name__)
+
+class DepManager:
+    """
+    Helper that manages optional dependencies for the Graphistry Python client:
+    attribute access dynamically imports a package and returns the module,
+    or None when the package is unavailable.
+
+    :ivar pkgs: dict cache of imported packages, preventing redundant imports
+
+    **Example**
+        ::
+
+            deps = DepManager()
+            has_umap = deps.umap
+            has_cudf = deps.cudf
+    """
+    def __init__(self):
+        self.pkgs = {}  # cache of imported packages, prevents redundant imports
+
+    def __getattr__(self, pkg: str):
+        self._add_deps(pkg)  # import the package on first access
+        try:
+            return self.pkgs[pkg]  # return the cached module
+        except KeyError:
+            return None
+
+    def _add_deps(self, pkg: str):
+        try:
+            # When importing cudf, also import cupy, since environments such as
+            # Colab install cudf even on CPU-only machines
+            if pkg == 'cudf':
+                cupy_val = import_module('cupy')
+                self.pkgs['cupy'] = cupy_val
+                setattr(self, 'cupy', cupy_val)
+            pkg_val = import_module(pkg)
+            self.pkgs[pkg] = pkg_val  # store in the cache dict
+            setattr(self, pkg, pkg_val)  # bind the module onto the deps instance
+        except ModuleNotFoundError:
+            logger.debug(f"{pkg} not installed")
+        except ImportError:
+            logger.debug(f"{pkg} installed but misconfigured")
+
+    def import_from(self, pkg: str, name: str):
+        try:
+            # like _add_deps, but uses __import__ with a fromlist to reach into a top-level package
+            module = __import__(pkg, fromlist=[name])
+            self.pkgs[name] = module
+        except ModuleNotFoundError:
+            logger.debug(f"{pkg} not installed")
+        except ImportError:
+            logger.error(f"{pkg} installed but misconfigured")
+        except Exception as e:
+            logger.warning("Unexpected exn during lazy import", exc_info=e)
+
+
+deps = DepManager()
diff --git a/graphistry/utils/lazy_import.py b/graphistry/utils/lazy_import.py
index f7de35bdbf..354b80b221 100644
--- a/graphistry/utils/lazy_import.py
+++ b/graphistry/utils/lazy_import.py
@@ -5,18 +5,6 @@
 
 #TODO use new importer when it lands (this is copied from umap_utils)
-def lazy_cudf_import():
-    try:
-        warnings.filterwarnings("ignore")
-        import cudf  # type: ignore
-
-        return True, "ok", cudf
-    except ModuleNotFoundError as e:
-        return False, e, None
-    except Exception as e:
-        logger.warn("Unexpected exn during lazy import", exc_info=e)
-        return False, e, None
-
 def lazy_cuml_import():
     try:
         warnings.filterwarnings("ignore")
@@ -177,3 +165,34 @@ def assert_imported():
             "`pip install graphistry[ai]`"  # noqa
         )
         raise import_min_exn
+
+
+def make_safe_gpu_dataframes(X, y, engine):
+    """Coerce X and y to the dataframe type (pandas vs cudf) that `engine` expects."""
+    from .dep_manager import deps
+
+    def safe_cudf(X, y):
+        cudf = deps.cudf
+        pd = deps.pandas
+        # remove duplicate columns
+        if len(X.columns) != len(set(X.columns)):
+            X = X.loc[:, ~X.columns.duplicated()]
+        try:
+            y = y.loc[:, ~y.columns.duplicated()]
+        except:
+            pass
+        new_kwargs = {}
+        kwargs = {'X': X, 'y': y}
+        for key, value in kwargs.items():
+            if isinstance(value, cudf.DataFrame) and engine in ["pandas", "umap_learn", "dirty_cat"]:
+                new_kwargs[key] = value.to_pandas()
+            elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat"]:
+                new_kwargs[key] = cudf.from_pandas(value)
+            else:
+                new_kwargs[key] = value
+        return new_kwargs['X'], new_kwargs['y']
+
+    if deps.cudf:
+        return safe_cudf(X, y)
+    else:
+        return X, y
diff --git a/setup.py b/setup.py
index 1adc759906..a12acb9271 100755
--- a/setup.py
+++ b/setup.py
@@ -56,10 +56,11 @@ def unique_flatten_dict(d):
 }
 
 base_extras_heavy = {
-    'umap-learn': ['umap-learn', 'dirty-cat==0.2.0', 'scikit-learn>=1.0'],
+    'umap-learn': ['umap-learn', 'dirty-cat', 'scikit-learn'],
 }
 # https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed
 base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib']
+base_extras_heavy['cu-cat'] = ['cu-cat']
 
 base_extras = {**base_extras_light, **base_extras_heavy}
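Taken together, the new `DepManager` singleton replaces each bespoke `lazy_*_import()` tuple helper with attribute-style access. A short usage sketch based on the class above (the CPU fallback branch is illustrative):

```python
from graphistry.utils.dep_manager import deps

# First access imports and caches; missing packages resolve to None
cudf = deps.cudf
if cudf is not None:
    df = cudf.DataFrame({'x': [1, 2]})  # GPU dataframe when cudf is present
else:
    import pandas as pd
    df = pd.DataFrame({'x': [1, 2]})    # CPU fallback

# import_from caches the result of __import__(pkg, fromlist=[name]) under `name`
deps.import_from('torch', 'nn')
```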