Module markov.api.schemas.data_schema_impl
Classes
class ColComparison (confidence: float, mismatch: bool, message: str = '')
-
Column Comparison Details and how confident we are about this comparison
Static methods
def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Instance variables
var confidence : float
-
Set to true if incompatible columns are compared
var message : str
var mismatch : bool
-
Specific Message for comparison
Methods
def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class ColStats (mean: Union[float, str] = 'na',
mode: Union[float, str] = 'na',
max_value: Union[float, str] = 'na',
min_value: Union[float, str] = 'na',
median: Union[float, str] = 'na',
kurtosis: Union[float, str] = 'na',
variance: Union[float, str] = 'na',
skewness: Union[float, str] = 'na',
quartiles: Dict = builtins.dict)
-
Column Statistics for a DataSet
Static methods
def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Instance variables
prop iqr
-
Returns
Range between Q3 (Quartile) and Q1 (Quartile)
var kurtosis : float | str
-
Variance of this dataset column if applicable
var max_value : float | str
-
min_value of this dataset column if applicable
var mean : float | str
-
Mode of this dataset column if applicable
var median : float | str
-
Kurtosis of this dataset column if applicable
var min_value : float | str
-
median of this dataset column if applicable
var mode : float | str
-
max_value of this dataset column if applicable
var quartiles : Dict
-
Quartiles of this dataset column if applicable; defaults to an empty dictionary.
var skewness : float | str
var variance : float | str
-
Skewness of this dataset column if applicable
Methods
def get_quartile(self, value: float) ‑> str
-
Which quartile the given value belongs to.
Args
value
:float
- what is the quartile of this datapoint.
Returns
Get the Quartile of this point.
def std_away(self, value: float) ‑> float
-
How many standard deviations the point is away from the mean.
Args
value
:float
- input value to check.
Returns
How many std away is this value from the mean.
def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class ColumnInfo (col_type: VisionsBaseType,
info: Union[ColStats, TextColStats])
-
Type of column info to compare the column schema/values
Static methods
def create_from_json(json_str: str) ‑> ColumnInfo
-
Create the ColumnInfo from the serialized JSON string of this object.
Args
json_str
:str
- Serialized JSON string of a ColumnInfo object.
Returns
ColumnInfo created from the JSON string.
def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Instance variables
var col_type : visions.types.type.VisionsBaseType
var info : ColStats | TextColStats
Methods
def compare_type(self,
another_col: ColumnInfo) ‑> ColComparison
-
Compare the given ColumnInfo with another column info.
Args
another_col
:ColumnInfo
- Another ColumnInfo we have to compare this given column info with.
Returns
ColComparison object that contains comparison info.
def get_json(self) ‑> str
-
Serialize this object to JSON
Returns
JSON Serialized string of this object
def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class DataSchema (num_cols: int,
col_info: List[ColumnInfo])
-
Placeholder for DataSchema representation
Static methods
def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Instance variables
var col_info : List[ColumnInfo]
var num_cols : int
-
Column Signature of this dataset
Methods
def compare(self,
data_schema: DataSchema) ‑> DataSchemaComparisonResults
-
Compare with another data_set schema and return the mismatches found.
Args
data_schema
:DataSchema
- Another data_set schema to compare
Returns
DataSchemaComparisonResults containing the outcome of the comparison.
def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class DataSchemaComparisonResults
-
DataSchemaComparisonResults is the outcome of comparison of two data_set schemas
Static methods
def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Instance variables
prop mismatched_cols
-
Columns that are mismatched based on ColComparison results
Returns
List of mismatched cols and details.
Methods
def add_mismatched(self,
col_comparison: ColComparison)
-
Add a mismatched column and its details to these comparison results.
Args
col_comparison
:ColComparison
- Col comparison result for a specific mismatched column.
def has_mismatch(self) ‑> bool
-
Returns
True if there is a mismatch detected in comparison
def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class DescribeColumnResults (column_info: List[ColumnInfo],
sampled_df: DataFrame)
-
DescribeColumnResults shows the column stats for the describe operation on the columns.
Static methods
def from_dict(kvs: dict | list | str | int | float | bool | None, *, infer_missing=False) ‑> ~A
def from_json(s: str | bytes | bytearray,
*,
parse_float=None,
parse_int=None,
parse_constant=None,
infer_missing=False,
**kw) ‑> ~A
def schema(*,
infer_missing: bool = False,
only=None,
exclude=(),
many: bool = False,
context=None,
load_only=(),
dump_only=(),
partial: bool = False,
unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Instance variables
var column_info : List[ColumnInfo]
-
If the data_set is sampled, sampled_df is a copy of the sampled dataset; otherwise it is a reference to the original.
var sampled_df : pandas.core.frame.DataFrame
Methods
def to_dict(self, encode_json=False) ‑> Dict[str, dict | list | str | int | float | bool | None]
def to_json(self,
*,
skipkeys: bool = False,
ensure_ascii: bool = True,
check_circular: bool = True,
allow_nan: bool = True,
indent: int | str | None = None,
separators: Tuple[str, str] | None = None,
default: Callable | None = None,
sort_keys: bool = False,
**kw) ‑> str
class TextColStats (unique_token_count: int,
top_10_tokens: List,
avg_num_tokens: int,
min_num_tokens: int,
max_num_tokens: int,
median_num_tokens: int,
lang: str = 'not_analyzed')
-
Captures the column stats for Text data_set type columns
Instance variables
var avg_num_tokens : int
-
Min length of unique tokens in text feature column
var lang : str
-
URL to fetch the TextColStats for this dataSet
var max_num_tokens : int
-
median length of the feature
var median_num_tokens : int
-
Language detected for this dataset
var min_num_tokens : int
-
max length of unique tokens in text feature column
var top_10_tokens : List
-
Average length of unique tokens in text feature column
var unique_token_count : int
-
top 10 tokens in the Feature Column
Methods
def display_in_hub(self)
def plot(self)
-
Publish and display the stats.