Code for counting the number of sentences, words and characters in an input file
I have written the following code to count the number of sentences, words and characters in the input file sample.txt, which contains a paragraph of text. It gives the correct number of sentences and words, but it does not give the correct number of characters (excluding whitespace and punctuation marks).
lines,blanklines,sentences,words=0,0,0,0
num_chars=0
print '-'*50
try:
filename = 'sample.txt'
textf = open(filename,'r')c
except IOE开发者_StackOverflow中文版rror:
print 'cannot open file %s for reading' % filename
import sys
sys.exit(0)
for line in textf:
print line
lines += 1
if line.startswith('\n'):
blanklines += 1
else:
sentences += line.count('.')+ line.count ('!')+ line.count('?')
tempwords = line.split(None)
print tempwords
words += len(tempwords)
textf.close()
print '-'*50
print "Lines:", lines
print "blank lines:",blanklines
print "sentences:",sentences
print "words:",words
import nltk
import nltk.data
import nltk.tokenize
with open('sample.txt' , 'r') as f:
for line in f:
num_chars += len(line)
num_chars = num_chars - (words +1 )
pcount = 0
from nltk.tokenize import TreebankWordTokenizer
with open('sample.txt','r') as f1:
for line in f1:
#tokenised_words = nltk.tokenize.word_tokenize(line)
tokenizer = TreebankWordTokenizer()
tokenised_words = tokenizer.tokenize(line)
for w in tokenised_words:
if ((w=='.')|(w==';')|(w=='!')|(w=='?')):
pcount = pcount + 1
print "pcount:",pcount
num_chars = num_chars - pcount
print "chars:",num_chars
pcount is the number of punctuation marks. Can someone suggest the changes I need to make in order to find the exact number of characters, excluding spaces and punctuation marks?
import string
#
# Per-line counting functions
#
def countLines(ln):
    """Contribute exactly one to the line total, whatever the line holds."""
    return 1
def countBlankLines(ln):
    """Return 1 if the line is empty or whitespace-only, otherwise 0."""
    if ln.strip():
        return 0
    return 1
def countWords(ln):
    """Return how many whitespace-separated tokens the line contains."""
    tokens = ln.split()
    return len(tokens)
def charCounter(validChars):
    """Build a per-line counter for a fixed set of characters.

    validChars is any iterable of single characters. The returned function
    takes one line of text and returns how many of its characters belong
    to that set.
    """
    vc = frozenset(validChars)  # built once; O(1) membership test per character

    def counter(ln):
        return sum(1 for ch in ln if ch in vc)

    return counter


countSentences = charCounter('.!?')
# string.ascii_letters exists on Python 2 and 3 and is locale-independent;
# string.letters was Python 2 only and varied with the active locale.
countLetters = charCounter(string.ascii_letters)
countPunct = charCounter(string.punctuation)
#
# do counting
#
class FileStats(object):
    """Accumulate per-line counts over one or more files.

    countFns is a sequence of callables; each receives a line of text and
    returns a number that is added to that callable's running total.
    labels supplies the display names used by str(); when it is omitted or
    empty, each callable's __name__ is used instead.
    """

    def __init__(self, countFns, labels=None):
        super(FileStats, self).__init__()
        self.fns = countFns
        # Fall back to the function names when no labels were supplied.
        self.labels = labels or [fn.__name__ for fn in countFns]
        self.reset()

    def reset(self):
        """Set every running total back to zero."""
        how_many = len(self.fns)
        self.counts = [0] * how_many

    def doFile(self, fname):
        """Add the counts for every line of fname to the running totals.

        An IOError while opening or reading is reported on stdout instead
        of being raised; totals gathered so far are kept.
        """
        try:
            with open(fname) as inf:
                for line in inf:
                    for idx, count_fn in enumerate(self.fns):
                        self.counts[idx] += count_fn(line)
        except IOError:
            print('Could not open file {0} for reading'.format(fname))

    def __str__(self):
        """Render one 'label count' row per counter, joined by newlines."""
        rows = []
        for label, count in zip(self.labels, self.counts):
            rows.append('{0:20} {1:>6}'.format(label, count))
        return '\n'.join(rows)
# Pair each per-line counter with its display label, then report on the
# sample file.
fs = FileStats(
    countFns=(countLines, countBlankLines, countSentences, countWords, countLetters, countPunct),
    labels=("Lines", "Blank Lines", "Sentences", "Words", "Letters", "Punctuation"),
)
fs.doFile('sample.txt')
print(fs)
results in
Lines                   101
Blank Lines              12
Sentences                48
Words                   339
Letters                1604
Punctuation             455
You can also use a regex to replace all non-alphanumeric characters and then count the number of characters in each line.
One thing you could do is, when you read a line, iterate through it and increment the number of characters:
# Count only letters and digits; whitespace and punctuation are skipped.
for character in line:
    if not character.isalnum():
        continue
    num_chars += 1
P.S. You might want to change the if statement's condition to suit your particular needs, e.g. if you also want to count $.
Try this to count the number of words and sentences, and to estimate the probability of a particular word:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

# Read all lines up front; the with-block closes the file even if reading fails.
with open("..//..//static//output.txt", "r") as text_file:
    lines = text_file.readlines()

x = 0             # occurrences of the token 'words'
total_tokens = 0  # tokens in the whole file, used as the probability denominator

tokenized_words = [word_tokenize(i) for i in lines]
for tokens in tokenized_words:
    print(tokens)  # list of tokens for this line
    print(str(len(tokens)))  # word count for this line
    total_tokens += len(tokens)
    x += tokens.count('words')  # simple count of the token 'words'

tokenized_sents = [sent_tokenize(k) for k in lines]
for sents in tokenized_sents:
    print("Sentences"+str(sents))  # list of sentences for this line
    print("number of sentences "+str(len(sents)))  # number of sentences

print("number of word"+str(x))
# Divide by the token count of the whole file, not just the last line, and
# use float division so Python 2 does not truncate the result to 0.
# An empty file yields 0.0 instead of raising.
probability = float(x) / total_tokens if total_tokens else 0.0
print("Probability of 'word' in text file "+str(probability))
精彩评论