Module `markov.api.preprocessors.nlp.text_preprocessor`

Classes

class TextPreProcessor

Static methods

def default_text_clean(ss: dask.dataframe.core.Series) ‑> dask.dataframe.core.Series

Apply the most common cleaning operations to the string series

Args

ss (dd.Series): input series with text to clean

Returns

Normalized series

def lowercase(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series

Convert the text series to lower case

Args

s : dd.Series: Input Text Series

Returns

all strings in the text lower-cased

def remove_all_brackets(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series

Remove all brackets from the text input

Args

s : dd.Series: Input Text Series

Returns

Series with all brackets "{}, [], ()" removed

def remove_html_tags(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series

Remove HTML Tags from the input text

Args

s : dd.Series: input text series (DataFrame Series)

Returns

HTML tags removed series

def remove_punctuation(ss: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series

Remove punctuation from the string series. The series will be processed in place

Args

ss : dd.Series: Series to be processed in place

Returns

series with punctuation removed

def remove_smileys(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series

Remove smileys from the text

Args

s : dd.Series: input text series (DataFrame Series)

Returns

Smileys removed series

def remove_whitespace(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series

Remove white space from the input text

Args

s : dd.Series: input text series to remove white space from

Returns

series with white space removed from the input text

def replace_digits(s: dask.dataframe.core.Series | pandas.core.series.Series, symbol: str = '', only_blocks: bool = True) ‑> dask.dataframe.core.Series | pandas.core.series.Series

Replace digits in the input string

Args

s : dd.Series: input text series
symbol : str: to replace with
only_blocks : bool: continuous digits only, e.g. 1234

Returns

text with digits replaced with the symbol

def replace_regex(s: dask.dataframe.core.Series, regex: str, symbol: str)

def replace_stopwords(txt: str, words: Set[str], symbol: str = '') ‑> str

Replace stop words

Args

txt : str: input text to remove the stopwords from
words : Set: Set containing the STOP WORDS

symbol : str: symbol to replace the stop words with

Returns

text with the stop words replaced by the symbol

def replace_urls(s: dask.dataframe.core.Series | pandas.core.series.Series, symbol: str = '') ‑> dask.dataframe.core.Series

Replace URLs in the input string

Args

s : dd.Series: input text series
symbol : str: to replace with

Returns

text with URLs replaced with the symbol

def tokenize(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series

Tokenize series

Args

s : dd.Series: input series

Returns

tokenized series

def uppercase(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series

Convert to upper case

Args

s : dd.Series: Input Text Series

Returns

all strings in the text upper-cased

Methods

def preprocess(self, df: dask.dataframe.core.DataFrame | pandas.core.frame.DataFrame)

def tokenize_remove_stopwords(self, s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series

Tokenize all the input strings and remove stopwords

Args

s : dd.Series: input series

Returns

normalized and tokenized series

def tweet_preprocessor(self)