Module markov.api.preprocessors.nlp.text_preprocessor

Classes

class TextPreProcessor

Static methods

def default_text_clean(ss: dask.dataframe.core.Series) ‑> dask.dataframe.core.Series

Apply the most common cleaning operations to a string series

Args

ss (dd.Series): input series with text to clean

Returns

Normalized series

def lowercase(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]

Convert a text series to lower case

Args

s : dd.Series
Input Text Series

Returns

all strings in the text lower-cased

def remove_all_brackets(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]

Remove all brackets from the text input

Args

s : dd.Series
Input Text Series

Returns

Series with all brackets "{}", "[]", "()" removed

def remove_html_tags(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]

Remove HTML Tags from the input text

Args

s : dd.Series
input text series (DataFrame Series)

Returns

HTML tags removed series

def remove_punctuation(ss: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]

Remove punctuation from the string series. The series will be processed in place

Args

ss : dd.Series
Series to be processed in place

Returns

series with punctuation removed

def remove_smileys(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]

Remove smileys from text

Args

s : dd.Series
input text series (DataFrame Series)

Returns

Smileys removed series

def remove_whitespace(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> dask.dataframe.core.Series

Remove white space from the input text

Args

s : dd.Series
input text series to remove white space from

Returns

series with white space removed from the input text

def replace_digits(s: Union[dask.dataframe.core.Series, pandas.core.series.Series], symbol: str = '', only_blocks: bool = True) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]

Replace digits in the input string

Args

s : dd.Series
input text series
symbol : str
to replace with
only_blocks : bool
if True, replace only continuous blocks of digits, e.g. 1234

Returns

text with digits replaced with the symbol

def replace_regex(s: dask.dataframe.core.Series, regex: str, symbol: str)
def replace_stopwords(txt: str, words: Set[str], symbol: str = '') ‑> str

Replace stop words

Args

txt : str
input text to remove the stopwords from
words : Set
Set containing the STOP WORDS

symbol (str): symbol to replace each stop word with in the input text

Returns

text with stop words replaced by the symbol

def replace_urls(s: Union[dask.dataframe.core.Series, pandas.core.series.Series], symbol: str = '') ‑> dask.dataframe.core.Series

Replace URLs in the input string

Args

s : dd.Series
input text series
symbol : str
to replace with

Returns

text with URLs replaced with the symbol

def tokenize(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> dask.dataframe.core.Series

Tokenize series

Args

s : dd.Series
input series

Returns

tokenized series

def uppercase(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]

Convert to upper case

Args

s : dd.Series
Input Text Series

Returns

all strings in the text upper-cased

Methods

def preprocess(self, df: Union[dask.dataframe.core.DataFrame, pandas.core.frame.DataFrame])
def tokenize_remove_stopwords(self, s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> dask.dataframe.core.Series

Tokenize all the input strings and remove stopwords

Args

s : dd.Series
input series

Returns

normalized and tokenized series

def tweet_preprocessor(self)