Module markov.api.schemas.data_schema_impl

Classes

class ColComparison (confidence: float, mismatch: bool, message: str = '')

Column Comparison Details and how confident we are about this comparison

Static methods

def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]

Instance variables

var confidence : float

Set to true if incompatible columns are compared

var message : str
var mismatch : bool

Specific Message for comparison

Methods

def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class ColStats (mean: Union[float, str] = 'na',
mode: Union[float, str] = 'na',
max_value: Union[float, str] = 'na',
min_value: Union[float, str] = 'na',
median: Union[float, str] = 'na',
kurtosis: Union[float, str] = 'na',
variance: Union[float, str] = 'na',
skewness: Union[float, str] = 'na',
quartiles: Dict = builtins.dict)

Column Statistics for a DataSet

Static methods

def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]

Instance variables

prop iqr

Returns

Range between Q3 (Quartile) and Q1 (Quartile)

var kurtosis : float | str

Variance of this dataset column if applicable

var max_value : float | str

min_value of this dataset column if applicable

var mean : float | str

Mode of this dataset column if applicable

var median : float | str

Kurtosis of this dataset column if applicable

var min_value : float | str

median of this dataset column if applicable

var mode : float | str

max_value of this dataset column if applicable

var quartiles : Dict

dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

var skewness : float | str
var variance : float | str

Skewness of this dataset column if applicable

Methods

def get_quartile(self, value: float) ‑> str

Determine which quartile the given value belongs to.

Args

value : float
The data point whose quartile is to be determined.

Returns

The quartile of this data point.

def std_away(self, value: float) ‑> float

How many standard deviations the point is away from the mean.

Args

value : float
input value to check.

Returns

The number of standard deviations this value is away from the mean.

def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class ColumnInfo (col_type: VisionsBaseType,
info: Union[ColStats, TextColStats])

Type of column info to compare the column schema/values

Static methods

def create_from_json(json_str: str) ‑> ColumnInfo

Create the ColumnInfo from the serialized JSON string of this object.

Args

json_str : str
Serialized JSON string of this object.

Returns

ColumnInfo created from the JSON string.

def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]

Instance variables

var col_type : visions.types.type.VisionsBaseType
var info : ColStats | TextColStats

Methods

def compare_type(self,
another_col: ColumnInfo) ‑> ColComparison

Compare the given ColumnInfo with another column info.

Args

another_col : ColumnInfo
Another ColumnInfo we have to compare this given column info with.

Returns

ColComparison object that contains comparison info.

def get_json(self) ‑> str

Serialize this object to JSON

Returns

JSON Serialized string of this object

def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class DataSchema (num_cols: int,
col_info: List[ColumnInfo])

Placeholder for DataSchema representation

Static methods

def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]

Instance variables

var col_info : List[ColumnInfo]
var num_cols : int

Column Signature of this dataset

Methods

def compare(self,
data_schema: DataSchema) ‑> DataSchemaComparisonResults

Compare this schema with another data_set schema and return the mismatches found.

Args

data_schema : DataSchema
Another data_set schema to compare

Returns

DataSchemaComparisonResults holding the outcome of the comparison.

def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class DataSchemaComparisonResults

DataSchemaComparisonResults is the outcome of comparison of two data_set schemas

Static methods

def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]

Instance variables

prop mismatched_cols

Columns that are mismatched based on ColComparison results

Returns

List of mismatched cols and details.

Methods

def add_mismatched(self,
col_comparison: ColComparison)

Add a mismatched column and its details to the comparison results.

Args

col_comparison : ColComparison
Col comparison result for a specific mismatched column.
def has_mismatch(self) ‑> bool

Returns

True if there is a mismatch detected in comparison

def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class DescribeColumnResults (column_info: List[ColumnInfo],
sampled_df: DataFrame)

DescribeColumnResults shows the column stats for the describe operation on the columns.

Static methods

def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]

Instance variables

var column_info : List[ColumnInfo]

If the data_set is sampled, sampled_df is a copy of the sampled dataset; otherwise it is a reference to the original.

var sampled_df : pandas.core.frame.DataFrame

Methods

def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class TextColStats (unique_token_count: int,
top_10_tokens: List,
avg_num_tokens: int,
min_num_tokens: int,
max_num_tokens: int,
median_num_tokens: int,
lang: str = 'not_analyzed')

Captures the column stats for Text data_set type columns

Instance variables

var avg_num_tokens : int

Min length of unique tokens in text feature column

var lang : str

URL to fetch the TextColStats for this dataSet

var max_num_tokens : int

median length of the feature

var median_num_tokens : int

Language detected for this dataset

var min_num_tokens : int

max length of unique tokens in text feature column

var top_10_tokens : List

Average length of unique tokens in text feature column

var unique_token_count : int

top 10 tokens in the Feature Column

Methods

def display_in_hub(self)
def plot(self)

Publish and display the stats.