python getelementbyid from string
I have the following program, that is trying to upload a file (or files) to an image upload site, however I am struggling to find out how to parse the returned HTML to grab the direct link (contained in a <dd class="download"><input type="text" value="{hereisthelink}"></dd> ).
I have the code below:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pycurl
import urllib
import urlparse
import xml.dom.minidom
import StringIO
import sys
import gtk
import os
import imghdr
import locale
import gettext
try:
    import pynotify
except:
    print "Install pynotify. It's whoasome!"
APP="Uploadir Uploader"
DIR="locale"
locale.setlocale(locale.LC_ALL, '')
gettext.bindtextdomain(APP, DIR)
gettext.textdomain(APP)
_ = gettext.gettext
##STRINGS
uploading = _("Uploading image to Uploadir.")
oneimage = _("1 image has been successfully uploaded.")
multimages = _("images have been successfully uploaded.")
uploadfailed = _("Unable to upload to Uploadir.")
class Uploadir:
    def __init__(self, args):
        self.images = []
        self.urls = []
        self.broadcasts = []
        self.username=""
        self.password=""
        if len(args) == 1:
            return
        else:
            for file in args:
                if file == args[0] or file == "":
                    continue
                if file.startswith("-u"):
                    self.username = file.split("-u")[1]
                    #print self.username
                    continue
                if file.startswith("-p"):
                    self.password = file.split("-p")[1]
                    #print self.password
                    continue
                self.type = imghdr.what(file)
                self.images.append(file)
        for file in self.images:
            self.upload(file)
        self.setClipBoard()
        self.broadcast(self.broadcasts)
    def broadcast(self, l):
        try:
            str = '\n'.join(l)
            n = pynotify.Notification(str)
            n.set_urgency(pynotify.URGENCY_LOW)
            n.show()
        except:
            for line in l:
                print line
    d开发者_如何学JAVAef upload(self, file):
        #Try to login
            cookie_file_name = "/tmp/uploadircookie"
        if ( self.username!="" and self.password!=""):
            print "Uploadir authentication in progress"
            l=pycurl.Curl()
            loginData = [ ("username",self.username),("password", self.password), ("login", "Login") ]
            l.setopt(l.URL, "http://uploadir.com/user/login")
            l.setopt(l.HTTPPOST, loginData)
            l.setopt(l.USERAGENT,"User-Agent: Uploadir (Python Image Uploader)")
            l.setopt(l.FOLLOWLOCATION,1)
            l.setopt(l.COOKIEFILE,cookie_file_name)
            l.setopt(l.COOKIEJAR,cookie_file_name)
            l.setopt(l.HEADER,1)
            loginDataReturnedBuffer = StringIO.StringIO()
            l.setopt( l.WRITEFUNCTION, loginDataReturnedBuffer.write )
            if l.perform():
                self.broadcasts.append("Login failed. Please check connection.")
                l.close()
                return
            loginDataReturned = loginDataReturnedBuffer.getvalue()
            l.close()
            #print loginDataReturned
            if loginDataReturned.find("<li>Your supplied username or password is invalid.</li>")!=-1:
                self.broadcasts.append("Uploadir authentication failed. Username/password invalid.")
                return
            else:
                self.broadcasts.append("Uploadir authentication successful.")
            #cookie = loginDataReturned.split("Set-Cookie: ")[1]
            #cookie = cookie.split(";",0)
            #print cookie
        c = pycurl.Curl()
        values = [
                ("file", (c.FORM_FILE, file)),
                ("terms", "1"),
                ("submit", "submit")
             ]
        buf = StringIO.StringIO()
        c.setopt(c.URL, "http://uploadir.com/file/upload")
        c.setopt(c.HTTPPOST, values)
        c.setopt(c.COOKIEFILE, cookie_file_name)
        c.setopt(c.COOKIEJAR, cookie_file_name)
        c.setopt(c.WRITEFUNCTION, buf.write)
        if c.perform():
            self.broadcasts.append(uploadfailed+" "+file+".")
            c.close()
            return
        self.result = buf.getvalue()
        #print self.result
        c.close()
        doc = urlparse.urlparse(self.result)
        print doc
        self.urls.append(doc.getElementsByTagName("download")[0].childNodes[0].nodeValue)
    def setClipBoard(self):
        c = gtk.Clipboard()
        c.set_text('\n'.join(self.urls))
        c.store()
        if len(self.urls) == 1:
            self.broadcasts.append(oneimage)
        elif len(self.urls) != 0:
            self.broadcasts.append(str(len(self.urls))+" "+multimages)
if __name__ == '__main__':
    uploadir = Uploadir(sys.argv)
The code that deals with the HTML parsing is here:
doc = urlparse.urlparse(self.result)
self.urls.append(doc.getElementsByTagName("download")[0].childNodes[0].nodeValue)
The urlparse module has nothing to do with parsing HTML.  All it does is break a URL up into bits: protocol, network address, path, etc.  For example:
>>> urlparse.urlparse("http://www.stackoverflow.com/questions/4699888")
ParseResult(scheme='http', netloc='www.stackoverflow.com', path='/questions/4699888', params='', query='', fragment='')
For parsing HTML, try BeautifulSoup.
 
         加载中,请稍侯......
 加载中,请稍侯......
      
精彩评论