Module markov.api.preprocessors.nlp.text_preprocessor
Classes
class TextPreProcessor-
Static methods
def default_text_clean(ss: dask.dataframe.core.Series) ‑> dask.dataframe.core.Series-
Apply most common cleaning operations on the string dataframe
Args
ss (dd.Series): input series with text to clean
Returns
Normalized series
def lowercase(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series-
Convert the text series to lower case
Args
s:dd.Series- Input Text Series
Returns
all strings in the text lower-cased
def remove_all_brackets(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series-
Remove all brackets from the text input
Args
s:dd.Series- Input Text Series
Returns
All brackets " {},[],()" removed
-
Remove HTML Tags from the input text
Args
s:dd.Series- input text series (DataFrame Series)
Returns
HTML tags removed series
def remove_punctuation(ss: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series-
Remove punctuation from the string series. The series will be processed in place
Args
ss:dd.Series- Series to be processed in place
Returns
series with punctuation removed
def remove_smileys(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series-
Remove smileys from the text
Args
s:dd.Series- input text series (DataFrame Series)
Returns
Smileys removed series
def remove_whitespace(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series-
Remove white space from the input text
Args
s:dd.Series- input text series to remove white space from
Returns
series with white space removed from the input text
def replace_digits(s: dask.dataframe.core.Series | pandas.core.series.Series,
symbol: str = '',
only_blocks: bool = True) ‑> dask.dataframe.core.Series | pandas.core.series.Series-
Replace digits in the input string
Args
s:dd.Series- input text series
symbol:str- to replace with
only_blocks:bool- if True, replace only continuous blocks of digits (e.g. 1234)
Returns
text with digits replaced with the symbol
def replace_regex(s: dask.dataframe.core.Series, regex: str, symbol: str)
def replace_stopwords(txt: str, words: Set[str], symbol: str = '') ‑> str-
Replace stop words
Args
txt:str- input text to remove the stopwords from
words:Set- Set containing the STOP WORDS
symbol:str- symbol to replace each stop word with
Returns
stop word removed text
def replace_urls(s: dask.dataframe.core.Series | pandas.core.series.Series, symbol: str = '') ‑> dask.dataframe.core.Series-
Replace URLs in the input string
Args
s:dd.Series- input text series
symbol:str- to replace with
Returns
text with URLs replaced with the symbol
def tokenize(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series-
Tokenize series
Args
s:dd.Series- input series
Returns
tokenized series
def uppercase(s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series | pandas.core.series.Series-
Convert to upper case
Args
s:dd.Series- Input Text Series
Returns
all strings in the text upper-cased
Methods
def preprocess(self, df: dask.dataframe.core.DataFrame | pandas.core.frame.DataFrame)
def tokenize_remove_stopwords(self, s: dask.dataframe.core.Series | pandas.core.series.Series) ‑> dask.dataframe.core.Series-
Tokenize all the input strings and remove stopwords
Args
s:dd.Series- input series
Returns
normalized and tokenized series
def tweet_preprocessor(self)