
Code for counting the number of sentences, words and characters in an input file

I have written the following code to count the number of sentences, words and characters in the input file sample.txt, which contains a paragraph of text. It works fine for counting the sentences and words, but it does not give the correct number of characters (excluding whitespace and punctuation marks).

lines,blanklines,sentences,words=0,0,0,0
num_chars=0

print '-'*50

try:
    filename = 'sample.txt'
    textf = open(filename, 'r')
except IOError:
    print 'cannot open file %s for reading' % filename
    import sys
    sys.exit(0)

for line in textf:
    print line
    lines += 1
    if line.startswith('\n'):
        blanklines += 1
    else:
        sentences += line.count('.') + line.count('!') + line.count('?')

        tempwords = line.split(None)
        print tempwords
        words += len(tempwords)

textf.close()

print '-'*50
print "Lines:", lines
print "blank lines:", blanklines
print "sentences:", sentences
print "words:", words

import nltk
import nltk.data
import nltk.tokenize

with open('sample.txt', 'r') as f:
    for line in f:
        num_chars += len(line)

num_chars = num_chars - (words +1 )

pcount = 0
from nltk.tokenize import TreebankWordTokenizer
with open('sample.txt', 'r') as f1:
    for line in f1:
        #tokenised_words = nltk.tokenize.word_tokenize(line)
        tokenizer = TreebankWordTokenizer()
        tokenised_words = tokenizer.tokenize(line)
        for w in tokenised_words:
            if ((w == '.') | (w == ';') | (w == '!') | (w == '?')):
                pcount = pcount + 1

print "pcount:", pcount
num_chars = num_chars - pcount
print "chars:", num_chars

pcount is the number of punctuation marks. Can someone suggest the changes I need to make in order to find the exact number of characters, excluding spaces and punctuation marks?


import string

#
# Per-line counting functions
#
def countLines(ln):      return 1
def countBlankLines(ln): return 0 if ln.strip() else 1
def countWords(ln):      return len(ln.split())

def charCounter(validChars):
    vc = set(validChars)
    def counter(ln):
        return sum(1 for ch in ln if ch in vc)
    return counter
countSentences = charCounter('.!?')
countLetters   = charCounter(string.letters)
countPunct     = charCounter(string.punctuation)

#
# do counting
#
class FileStats(object):
    def __init__(self, countFns, labels=None):
        super(FileStats,self).__init__()
        self.fns    = countFns
        self.labels = labels if labels else [fn.__name__ for fn in countFns]
        self.reset()

    def reset(self):
        self.counts = [0]*len(self.fns)

    def doFile(self, fname):
        try:
            with open(fname) as inf:
                for line in inf:
                    for i,fn in enumerate(self.fns):
                        self.counts[i] += fn(line)
        except IOError:
            print('Could not open file {0} for reading'.format(fname))

    def __str__(self):
        return '\n'.join('{0:20} {1:>6}'.format(label, count) for label,count in zip(self.labels, self.counts))

fs = FileStats(
    (countLines, countBlankLines, countSentences, countWords, countLetters, countPunct),
    ("Lines",    "Blank Lines",   "Sentences",    "Words",    "Letters",    "Punctuation")
)
fs.doFile('sample.txt')
print(fs)

results in

Lines                   101
Blank Lines              12
Sentences                48
Words                   339
Letters                1604
Punctuation             455


You can also use a regex to replace all non-alphanumeric characters and then count the number of characters in each line.
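For example, a minimal sketch of that regex approach, assuming the same sample.txt file as in the question; the pattern [^A-Za-z0-9] removes everything that is not a letter or digit before counting:

import re

num_chars = 0
with open('sample.txt', 'r') as f:
    for line in f:
        # keep only letters and digits, then count what is left
        stripped = re.sub(r'[^A-Za-z0-9]', '', line)
        num_chars += len(stripped)

print('chars: %d' % num_chars)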


One thing you could do is, when you read each line, iterate through it and increment the number of characters:

for character in line:
    if character.isalnum():
        num_chars += 1

P.S. You might want to change the if-statement condition to satisfy your particular needs, e.g. if you also want to count $.
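For instance, a small variation on the loop above (just a sketch; the extra set of symbols is a hypothetical example) that also counts a few chosen characters such as $:

extra = set('$')  # hypothetical set of additional characters to include
num_chars = 0
with open('sample.txt', 'r') as f:
    for line in f:
        for character in line:
            # count letters, digits, and the chosen extra symbols
            if character.isalnum() or character in extra:
                num_chars += 1

print('chars: %d' % num_chars)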


Try this to count the number of words and the number of sentences, and to get the probability of a given word appearing in the text:

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize


text_file = open("..//..//static//output.txt", "r")
lines = text_file.readlines()
x=0
tokenized_words = [word_tokenize(i) for i in lines]
for i in tokenized_words:

    print(i)  # list of tokens in this line
    print(str(len(i)))  # word count for this line

    for j in i:
        if j == 'words':  # count occurrences of the word 'words'
            x = x + 1

tokenized_sents = [sent_tokenize(k) for k in lines]

for k in tokenized_sents:
    print("Sentences" + str(k))  # list of sentences in this line
    print("number of sentences " + str(len(k)))  # sentence count for this line

total_words = sum(len(i) for i in tokenized_words)  # total number of tokens in the file
print("number of words " + str(x))
print("Probability of 'words' in text file " + str(x / total_words))
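Note that tokenizing line by line can split sentences that span more than one line. A minimal variant (a sketch, reusing the same output.txt path) that tokenizes the whole file at once:

from nltk.tokenize import word_tokenize, sent_tokenize

# Read the whole file so multi-line sentences are handled correctly.
with open("..//..//static//output.txt", "r") as text_file:
    text = text_file.read()

sentences = sent_tokenize(text)
tokens = word_tokenize(text)
# Drop pure punctuation tokens so they are not counted as words.
word_tokens = [w for w in tokens if any(ch.isalnum() for ch in w)]

print("number of sentences " + str(len(sentences)))
print("number of words " + str(len(word_tokens)))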
