Module markov.api.schemas.data_schema_impl
Classes
class ColComparison (confidence: float, mismatch: bool, message: str = '')
-
Column Comparison Details and how confident we are about this comparison
Class variables
var confidence : float
-
How confident we are about this comparison
var message : str
-
Specific message for comparison
var mismatch : bool
-
Set to true if incompatible columns are compared
Static methods
def from_dict(kvs: Union[dict, list, str, int, float, bool, ForwardRef(None)], *, infer_missing=False) ‑> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) ‑> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Methods
def to_dict(self, encode_json=False) ‑> Dict[str, Union[dict, list, str, int, float, bool, ForwardRef(None)]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, ForwardRef(None)] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable] = None, sort_keys: bool = False, **kw) ‑> str
class ColStats (mean: Union[float, str] = 'na', mode: Union[float, str] = 'na', max_value: Union[float, str] = 'na', min_value: Union[float, str] = 'na', median: Union[float, str] = 'na', kurtosis: Union[float, str] = 'na', variance: Union[float, str] = 'na', skewness: Union[float, str] = 'na', quartiles: Dict = builtins.dict)
-
Column Statistics for a DataSet
Class variables
var kurtosis : Union[float, str]
-
Kurtosis of this dataset column if applicable
var max_value : Union[float, str]
-
max_value of this dataset column if applicable
var mean : Union[float, str]
-
Mean of this dataset column if applicable
var median : Union[float, str]
-
Median of this dataset column if applicable
var min_value : Union[float, str]
-
min_value of this dataset column if applicable
var mode : Union[float, str]
-
Mode of this dataset column if applicable
var quartiles : Dict
-
Quartile values of this dataset column if applicable (defaults to an empty dict)
var skewness : Union[float, str]
var variance : Union[float, str]
-
Variance of this dataset column if applicable
Static methods
def from_dict(kvs: Union[dict, list, str, int, float, bool, ForwardRef(None)], *, infer_missing=False) ‑> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) ‑> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Instance variables
prop iqr
-
Returns
Range between Q3 (Quartile) and Q1 (Quartile)
Methods
def get_quartile(self, value: float) ‑> str
-
Which quartile the given value belongs to.
Args
value
:float
- the datapoint whose quartile is requested.
Returns
The quartile of this datapoint.
def std_away(self, value: float) ‑> float
-
How many standard deviations the point is away from the mean.
Args
value
:float
- input value to check.
Returns
Number of standard deviations this value is away from the mean.
def to_dict(self, encode_json=False) ‑> Dict[str, Union[dict, list, str, int, float, bool, ForwardRef(None)]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, ForwardRef(None)] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable] = None, sort_keys: bool = False, **kw) ‑> str
class ColumnInfo (col_type: VisionsBaseType, info: Union[ColStats, TextColStats])
-
Type of column info to compare the column schema/values
Class variables
var col_type : visions.types.type.VisionsBaseType
var info : Union[ColStats, TextColStats]
Static methods
def create_from_json(json_str: str) ‑> ColumnInfo
-
Create the ColumnInfo from the serialized JSON string of this object.
Args
json_str
:str
- Serialized JSON string of a ColumnInfo object.
Returns
ColumnInfo created from the JSON string.
def from_dict(kvs: Union[dict, list, str, int, float, bool, ForwardRef(None)], *, infer_missing=False) ‑> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) ‑> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Methods
def compare_type(self, another_col: ColumnInfo) ‑> ColComparison
-
Compare the given ColumnInfo with another column info.
Args
another_col
:ColumnInfo
- Another ColumnInfo we have to compare this given column info with.
Returns
ColComparison object that contains comparison info.
def get_json(self) ‑> str
-
Serialize this object to JSON
Returns
JSON Serialized string of this object
def to_dict(self, encode_json=False) ‑> Dict[str, Union[dict, list, str, int, float, bool, ForwardRef(None)]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, ForwardRef(None)] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable] = None, sort_keys: bool = False, **kw) ‑> str
class DataSchema (num_cols: int, col_info: List[ColumnInfo])
-
Placeholder for DataSchema representation
Class variables
var col_info : List[ColumnInfo]
-
Column Signature of this dataset
var num_cols : int
Static methods
def from_dict(kvs: Union[dict, list, str, int, float, bool, ForwardRef(None)], *, infer_missing=False) ‑> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) ‑> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Methods
def compare(self, data_schema: DataSchema) ‑> DataSchemaComparisonResults
-
Compare with another data_set schema and return the mismatches found.
Args
data_schema
:DataSchema
- Another data_set schema to compare
Returns
DataSchemaComparisonResults holding the outcome of the comparison.
def to_dict(self, encode_json=False) ‑> Dict[str, Union[dict, list, str, int, float, bool, ForwardRef(None)]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, ForwardRef(None)] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable] = None, sort_keys: bool = False, **kw) ‑> str
class DataSchemaComparisonResults
-
DataSchemaComparisonResults is the outcome of comparison of two data_set schemas
Static methods
def from_dict(kvs: Union[dict, list, str, int, float, bool, ForwardRef(None)], *, infer_missing=False) ‑> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) ‑> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Instance variables
prop mismatched_cols
-
Columns that are mismatched, based on ColComparison results
Returns
List of mismatched cols and details.
Methods
def add_mismatched(self, col_comparison: ColComparison)
-
Add a mismatched column and its details to the comparison results.
Args
col_comparison
:ColComparison
- Col comparison result for a specific mismatched column.
def has_mismatch(self) ‑> bool
-
Returns
True if there is a mismatch detected in comparison
def to_dict(self, encode_json=False) ‑> Dict[str, Union[dict, list, str, int, float, bool, ForwardRef(None)]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, ForwardRef(None)] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable] = None, sort_keys: bool = False, **kw) ‑> str
class DescribeColumnResults (column_info: List[ColumnInfo], sampled_df: DataFrame)
-
DescribeColumnResults shows the column stats for the describe operation on the columns.
Class variables
var column_info : List[ColumnInfo]
var sampled_df : pandas.core.frame.DataFrame
-
If the data_set is sampled, sampled_df is a copy of the sampled dataset; otherwise it is a reference to the original.
Static methods
def from_dict(kvs: Union[dict, list, str, int, float, bool, ForwardRef(None)], *, infer_missing=False) ‑> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) ‑> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Methods
def to_dict(self, encode_json=False) ‑> Dict[str, Union[dict, list, str, int, float, bool, ForwardRef(None)]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, ForwardRef(None)] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable] = None, sort_keys: bool = False, **kw) ‑> str
class TextColStats (unique_token_count: int, top_10_tokens: List, avg_num_tokens: int, min_num_tokens: int, max_num_tokens: int, median_num_tokens: int, lang: str = 'not_analyzed')
-
Captures the column stats for Text data_set type columns
Class variables
var avg_num_tokens : int
-
Average length of unique tokens in text feature column
var lang : str
-
Language detected for this dataset
var max_num_tokens : int
-
max length of unique tokens in text feature column
var median_num_tokens : int
-
median length of the feature
var min_num_tokens : int
-
Min length of unique tokens in text feature column
var top_10_tokens : List
-
top 10 tokens in the Feature Column
var unique_token_count : int
-
Number of unique tokens in the text feature column
Methods
def display_in_hub(self)
def plot(self)
-
Publish and display the stats.