开发者

Python-Wikipedia Automated Downloader

[Using Python 3.1] Does anyone have any idea how to make a Python 3 application allow the user to write a text file with multiple words separated with commas. The program should read the file, and download the Wikipedia page of the requested item. e.g. if they typed hello,python-3,chicken it would go to Wikipedia and download http://www.wikipedia.com/wiki/hello, http://www.wikip... Anyone think they can do this?

When I say "download" I mean download the text, doesn't matter about images.


Look up urllib.request.


You described exactly how to make such a program. So what is the question?

You read the file, split on commas, and download the URL. Done!


Check the following code, it downloads the html, without the images, but you can access them from the xml file that is being parsed to get the url.

from time import sleep
import urllib
import urllib2
from xml.dom import minidom, Node

def _child_elements(parent, tag_name):
    """Return the direct child elements of *parent* whose tag is *tag_name*."""
    return [child for child in parent.childNodes
            if child.nodeType == Node.ELEMENT_NODE
            and child.nodeName == tag_name]


def _first_child_text(parent, tag_name):
    """Return the text of the first *tag_name* child that holds one node.

    Children that do not contain exactly one node are skipped. Returns None
    when no suitable child exists.
    """
    for element in _child_elements(parent, tag_name):
        if len(element.childNodes) == 1:
            return element.childNodes[0].data
    return None


def main():
    """Download the top Wikipedia search hit for each keyword in example.txt.

    Reads one keyword per line from ``example.txt`` (blank lines are
    ignored), queries the Wikipedia ``opensearch`` API in XML format, prints
    the title of every result, and saves the HTML of the first result to
    ``Html/<title>.html``.

    Raises:
        IOError: if ``example.txt`` cannot be read or the output file cannot
            be written; network failures from ``urllib``/``urllib2``
            propagate to the caller as well.
    """
    # Local import so this function does not depend on a file-level change.
    import os

    print("Hello World")

    # ``open`` raises on failure, so no truthiness check is needed; ``with``
    # guarantees the handle is closed even if reading fails.
    with open("example.txt", 'r') as key_file:
        keywords = [line.strip() for line in key_file]
    # An empty keyword would only produce a pointless empty search.
    keywords = [keyword for keyword in keywords if keyword]

    print("Total keywords: %d" % len(keywords))
    for keyword in keywords:
        # Escape the keyword so spaces, '&', '#', etc. cannot corrupt the
        # query string.
        url = ("http://en.wikipedia.org/w/api.php"
               "?format=xml&action=opensearch&search=" + urllib.quote(keyword))
        response = urllib.urlopen(url)
        try:
            xmldoc = minidom.parse(response)
        finally:
            response.close()
        root_node = xmldoc.childNodes[0]

        sections = _child_elements(root_node, "Section")
        if not sections:
            continue

        items = _child_elements(sections[0], "Item")
        if not items:
            print("NO results found")
            continue

        print("\nResults found for " + keyword + ":\n")
        for item in items:
            for text_node in _child_elements(item, "Text"):
                if len(text_node.childNodes) == 1:
                    print(text_node.childNodes[0].data.encode('utf-8'))

        # Only the first (best) hit is downloaded.
        title = _first_child_text(items[0], "Text")
        page_url = _first_child_text(items[0], "Url")
        if title is None or page_url is None:
            # Nothing to save; do not leave an empty file behind.
            continue

        # A path separator inside the title would be treated as a
        # sub-directory, so neutralise it before building the file name.
        safe_title = title.replace("/", "_").replace("\\", "_")
        if not os.path.isdir("Html"):
            os.makedirs("Html")
        file_name = os.path.join("Html",
                                 "%s.html" % safe_title.encode('utf-8'))

        user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)'
        header = {'User-Agent': user_agent}
        request = urllib2.Request(url=page_url, headers=header)

        # Fetch before opening the output file so a failed download does not
        # create an empty file.
        page = urllib2.urlopen(request)
        try:
            html = page.read()
        finally:
            page.close()

        # Binary mode: the payload is raw bytes and must not have its
        # newlines translated on Windows.
        with open(file_name, 'wb') as out_file:
            out_file.write(html)

    # NOTE(review): this pause runs once, after every keyword has been
    # processed. It may have been meant as a per-request delay inside the
    # loop -- confirm before moving it.
    print("Sleeping")
    sleep(2)

# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜