开发者

counting a set dictionary of words in a specific html tag

I am trying to parse a document and, if there is a name associated with a specific docno, count the total number of names. After the for loop ends for that docno, I want to store names[docno] = word count. Therefore, if namedict={'henry':'','joe':''} and henry appears in docno 'doc 1' 4 times and joe 6 times, the dictionary would store it as {'doc 1': 10}. So far, all I can figure out is counting the total number of names in the entire text file.

from xml.dom.minidom import *
import re
from string import punctuation
from operator import itemgetter

def parseTREC1(atext):
    """Count how often each known name occurs in a TREC-style file.

    The file at ``atext`` holds a sequence of ``<DOC>`` elements with no
    single root element, so its content is wrapped in a synthetic
    ``<DOCS>`` element before being parsed as XML.  The first child of every
    ``<P>`` paragraph in every document is split into alphabetic words, and
    each word that exactly (case-sensitively) matches one of the known names
    is tallied.

    The tally covers the whole file and is keyed by name, not by document.
    It is printed once and also returned, so callers can use the result
    without capturing stdout.

    Args:
        atext: Path of the file to read.

    Returns:
        A dict mapping each name that occurred at least once to its count.
    """
    namelist = frozenset(['Matt', 'Earl', 'James'])
    w_re = re.compile('[a-z]+', re.IGNORECASE)

    # Close the file promptly instead of leaving it to garbage collection.
    with open(atext, 'r') as handle:
        fc = '<DOCS>\n' + handle.read() + '\n</DOCS>'
    dom = parseString(fc)

    names = {}
    for doc_node in dom.getElementsByTagName('DOC'):
        for p_node in doc_node.getElementsByTagName('P'):
            text_node = p_node.firstChild
            if text_node is None:
                # An empty <P></P> has no child node to read text from.
                continue
            for aword in w_re.findall(text_node.data):
                if aword in namelist:
                    names[aword] = names.get(aword, 0) + 1

    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print(names)
    return names

    #    top_words=sorted(names.iteritems(), key=lambda(word, count): (-count, word))[:N]

     #   for word, frequency in top_words:
    #        print "%s: %d" % (word, frequency)
        #print words + top_words
#print docno + "\t" + str(numbers)


# Run the name counter on 'LA010189.txt' (path is relative to the current
# working directory).
parseTREC1('LA010189.txt')


I've cleaned up your code a bit to make it easier to follow. Here are a few comments and suggestions:

  • To answer the key question: you should be storing the count in names[docno] = names.get(docno, 0) + 1.
  • Use a defaultdict(int) instead of names.get(aword, 0) + 1 to accumulate the count.
  • Use set() for the namelist.
  • findall() with your regular expression already returns the individual words, even across line breaks, so there is no need for line.split() (the re.MULTILINE option only changes how ^ and $ match).
  • You didn't use your words_gen, was that an oversight?

I used this doc to test with, based on your code:

<DOC>
    <DOCNO>1</DOCNO>
    <P>groucho
       harpo
       zeppo</P>
    <P>larry
       moe
       curly</P>
</DOC>
<DOC>
    <DOCNO>2</DOCNO>
    <P>zoe
       inara
       kaylie</P>
    <P>mal
       wash
       jayne</P>
</DOC>

Here is a cleaned-up version of the code to count names in each paragraph:

import re
from collections import defaultdict
from string import punctuation
from xml.dom.minidom import *

# Matches runs of ASCII letters, case-insensitively.  re.M is kept from the
# original pattern; it only changes how ^ and $ match, so it has no effect on
# this anchor-free expression.
RE_WORDS = re.compile('[a-z]+', re.IGNORECASE | re.M)


def parse(path, names):
    """Count occurrences of the given names in each document of a file.

    The file at ``path`` holds a sequence of ``<DOC>`` elements with no
    single root element, so its content is wrapped in a synthetic
    ``<DOCS>`` element before being parsed as XML.  For every document, the
    text directly inside each ``<P>`` paragraph is split into alphabetic
    words and every word that matches one of ``names`` (ignoring case) adds
    one to that document's total.  Repeated occurrences are all counted, so
    a name appearing 4 times and another appearing 6 times yields 10.

    Each ``(document key, count)`` pair is printed, and the counts are also
    returned.

    Args:
        path: Path of the file to read.
        names: Iterable of names to look for; matching is case-insensitive.

    Returns:
        A dict mapping ``'doc ' + <DOCNO text>`` to the number of name
        occurrences found in that document's paragraphs.
    """
    wanted = set(name.lower() for name in names)

    # Text mode keeps the concatenation with str literals valid on both
    # Python 2 and Python 3; the with-block closes the file promptly.
    with open(path, 'r') as handle:
        data = '<DOCS>' + handle.read() + '</DOCS>'
    tree = parseString(data)

    hits = defaultdict(int)
    for doc in tree.getElementsByTagName('DOC'):
        doc_no = 'doc ' + doc.getElementsByTagName('DOCNO')[0].firstChild.data
        for node in doc.getElementsByTagName('P'):
            # Join the direct text children so an empty <P></P> yields ''
            # instead of failing on a missing firstChild.
            text = ''.join(
                child.data for child in node.childNodes
                if child.nodeType in (child.TEXT_NODE,
                                      child.CDATA_SECTION_NODE))
            # RE_WORDS only yields letters, so no punctuation stripping is
            # needed; count every occurrence rather than distinct names.
            hits[doc_no] += sum(
                1 for word in RE_WORDS.findall(text)
                if word.lower() in wanted)

    for item in hits.items():
        # Parenthesised so the statement is valid on Python 2 and Python 3.
        print(item)
    return dict(hits)

# Lower-case names to search for; parse() lower-cases each word it extracts
# before comparing it against this set.
names = set((
    'zoe',
    'wash',
    'groucho',
    'moe',
    'curly',
))
parse('doc.xml', names)

Output:

(u'doc 2', 2)
(u'doc 1', 3)
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜