NLTK

#language #natural #nltk #notes #processing #python #setup #text #tokenization

Install

pip3 install nltk

# then, in a Python shell, download the NLTK data packages
import nltk
nltk.download("all")


Tokenizing

sent_tokenize -> Sentence Tokenizer, splits a body of text into sentences
word_tokenize -> Word Tokenizer, splits a sentence into words

from nltk.tokenize import sent_tokenize, word_tokenize

sampleText = "your text file"
print(sent_tokenize(sampleText))
print(word_tokenize(sampleText))
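With a concrete string (a made-up example), the difference looks like this:

from nltk.tokenize import sent_tokenize, word_tokenize

txt = "NLTK is a Python library. It makes tokenizing easy!"
print(sent_tokenize(txt))
# ['NLTK is a Python library.', 'It makes tokenizing easy!']
print(word_tokenize(txt))
# ['NLTK', 'is', 'a', 'Python', 'library', '.', 'It', 'makes', 'tokenizing', 'easy', '!']

Note that word_tokenize treats punctuation marks as separate tokens.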

Chunking

Grouping words into meaningful groups (phrases), based on their part-of-speech tags and a regex grammar

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# train a custom sentence tokenizer on a State of the Union speech
train_text = state_union.raw("2005-GWBush.txt")
txt = "Kids are playing. Kids like to play games. He got played"

custTokenizer = PunktSentenceTokenizer(train_text)
tokenizedText = custTokenizer.tokenize(txt)

for i in tokenizedText:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)

    # chunk: any number of adverbs and verbs, then one or more proper nouns, then an optional singular noun
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tag)
    #chunked.draw()
    chunked.pretty_print()
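pretty_print() shows the whole parse tree. To pull out only the matched chunks, you can walk the tree inside the same loop with subtrees() and filter on the label (a short sketch using the standard Tree API; with the sample sentence above nothing matches, since it contains no proper nouns, so try a sentence with names):

for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
    print(subtree)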

Stop Words

Stop words are words that don't contribute much to the meaning of a text: filler words like "a", "the", ...

These words are removed to make the text easier for a machine to process.

stopwords.words("english")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

txt = "your text file / input"

words = word_tokenize(txt)

# build the stop word set once; the list is all lowercase, so compare lowercased tokens
sw = set(stopwords.words("english"))
nw = [i for i in words if i.lower() not in sw]
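A quick check with a made-up sentence, reusing the sw set from above:

txt = "This is an example showing off stop word filtration."
words = word_tokenize(txt)
print([i for i in words if i.lower() not in sw])
# roughly: ['example', 'showing', 'stop', 'word', 'filtration', '.']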

Named Entity Recognition

import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
# state_union data set will be used for training PunktSentenceTokenizer to create a custom tokenizer

txt = "Your text file or input"

custTokenizer = PunktSentenceTokenizer(state_union.raw("2006-GWBush.txt"))
tokenizedText = custTokenizer.tokenize(txt)

for i in tokenizedText:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    nameEnt = nltk.ne_chunk(tag)
    nameEnt.pretty_print()
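ne_chunk labels entities with types such as PERSON, ORGANIZATION, and GPE. Passing binary=True instead collapses them all into a single NE label, which can make post-processing simpler (a sketch that would go inside the same loop; the list comprehension is just one way to flatten the tree):

nameEnt = nltk.ne_chunk(tag, binary=True)
entities = [" ".join(word for word, pos in subtree.leaves())
            for subtree in nameEnt.subtrees(filter=lambda t: t.label() == "NE")]
print(entities)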

Stemming

Stemming is the process of reducing a word to its word stem by stripping affixes (prefixes and suffixes). The stem is not necessarily a valid word, unlike a lemma, which is.

ps.stem("word")

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

txt = "Kids are playing. Kids like to play games. He got played"
wt = word_tokenize(txt)
print([ps.stem(i) for i in wt])
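Stems are not guaranteed to be real words, and recent NLTK versions also lowercase by default; the print above yields roughly:

['kid', 'are', 'play', '.', 'kid', 'like', 'to', 'play', 'game', '.', 'he', 'got', 'play']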


Lemmatizing

A very similar operation to stemming is called lemmatizing. The major difference between them is that stemming can often create non-existent words, whereas lemmas are actual words.

pos = "a" ⇒ Adjective

pos = "v" ⇒ Verb

pos defaults to "n" ⇒ Noun

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))    # cat
print(lemmatizer.lemmatize("cacti"))   # cactus
print(lemmatizer.lemmatize("geese"))   # goose
print(lemmatizer.lemmatize("rocks"))   # rock
print(lemmatizer.lemmatize("python"))  # python
print(lemmatizer.lemmatize("better", pos="a"))  # good
print(lemmatizer.lemmatize("best", pos="a"))    # best
print(lemmatizer.lemmatize("run"))       # run
print(lemmatizer.lemmatize("run", "v"))  # run

Part of Speech Tagging

import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
# state_union data set will be used for training PunktSentenceTokenizer to create a custom tokenizer

txt = "Kids are playing. Kids like to play games. He got played"

custTokenizer = PunktSentenceTokenizer(state_union.raw("2006-GWBush.txt"))
tokenizedText = custTokenizer.tokenize(txt)

for i in tokenizedText:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    print(tag)

state_union.raw("2006-GWBush.txt") This is the txt dataset we will be using for training, other are available, you can use your own.

Example output

[('Kids', 'NNS'), ('are', 'VBP'), ('playing', 'VBG'), ('.', '.')]
[('Kids', 'NNS'), ('like', 'IN'), ('to', 'TO'), ('play', 'VB'), ('games', 'NNS'), ('.', '.')]
[('He', 'PRP'), ('got', 'VBD'), ('played', 'JJ')]
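The tag abbreviations follow the Penn Treebank tagset. NLTK can explain any of them (this needs the "tagsets" data package):

import nltk
nltk.help.upenn_tagset("NNS")   # noun, common, plural
nltk.help.upenn_tagset("VB.*")  # the argument is a regex, so this lists all verb tags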