Module markov.api.preprocessors.nlp.text_preprocessor
Classes
class TextPreProcessor
-
Static methods
def default_text_clean(ss: dask.dataframe.core.Series) ‑> dask.dataframe.core.Series
-
Apply most common cleaning operations on the string dataframe
Args
ss (dd.Series): input series with text to clean
Returns
Normalized series
def lowercase(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]
-
Convert text series to lower case
Args
s
:dd.Series
- Input Text Series
Returns
all strings in the text lower-cased
def remove_all_brackets(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]
-
Remove all brackets from the text input
Args
s
:dd.Series
- Input Text Series
Returns
All brackets " {},[],()" removed
-
Remove HTML Tags from the input text
Args
s
:dd.Series
- input text series (DataFrame Series)
Returns
HTML tags removed series
def remove_punctuation(ss: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]
-
Remove punctuation from the string series. The series will be processed in place
Args
ss
:dd.Series
- Series to be processed in place
Returns
series with punctuation removed
def remove_smileys(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]
-
Remove smileys from text
Args
s
:dd.Series
- input text series (DataFrame Series)
Returns
Smileys removed series
def remove_whitespace(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> dask.dataframe.core.Series
-
Remove white space from the input text
Args
s
:dd.Series
- input text series to remove white space from
Returns
series with white space removed from the input text
def replace_digits(s: Union[dask.dataframe.core.Series, pandas.core.series.Series], symbol: str = '', only_blocks: bool = True) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]
-
Replace digits in the input string
Args
s
:dd.Series
- input text series
symbol
:str
- to replace with
only_blocks
:bool
- continuous digits only, e.g. 1234
Returns
text with digits replaced with the symbol
def replace_regex(s: dask.dataframe.core.Series, regex: str, symbol: str)
def replace_stopwords(txt: str, words: Set[str], symbol: str = '') ‑> str
-
Replace stop words
Args
txt
:str
- input text to remove the stopwords from
words
:Set
- Set containing the STOP WORDS
symbol (str): string to replace each stopword with in the input text
Returns
text with stop words replaced with the symbol
def replace_urls(s: Union[dask.dataframe.core.Series, pandas.core.series.Series], symbol: str = '') ‑> dask.dataframe.core.Series
-
Replace URLs in the input string
Args
s
:dd.Series
- input text series
symbol
:str
- to replace with
Returns
text with URLs replaced with the symbol
def tokenize(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> dask.dataframe.core.Series
-
Tokenize series
Args
s
:dd.Series
- input series
Returns
tokenized series
def uppercase(s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> Union[dask.dataframe.core.Series, pandas.core.series.Series]
-
Convert to upper case
Args
s
:dd.Series
- Input Text Series
Returns
all strings in the text upper-cased
Methods
def preprocess(self, df: Union[dask.dataframe.core.DataFrame, pandas.core.frame.DataFrame])
def tokenize_remove_stopwords(self, s: Union[dask.dataframe.core.Series, pandas.core.series.Series]) ‑> dask.dataframe.core.Series
-
Tokenize all the input strings and remove stopwords
Args
s
:dd.Series
- input series
Returns
normalized and tokenized series
def tweet_preprocessor(self)