#!/usr/bin/python
# Created:      05.07.2000
# Last checked: 15.04.2010
#
# Note: www.dict.cc support is currently broken since htmlTableParse.py
#       can't handle it :-(
#
# The script was formerly known as leo.py, it is now called x-dict
#
# You can access the following dictionaries now:
#   command line switch   URL            language
#   --leo                 dict.leo.org   english <-> german
#   --dict_cc             www.dict.cc    english <-> german
#
# Author: Stefan Reichoer, stefan@xsteve.at
#
# TODO:
# - implement useful cache handling - it is not used for dict.cc at the moment
# - eventually revive the pymacs interface
# - add more dictionaries
#
# Please report if the web page of dict.leo.org changes
# and this script no longer works!
#
# Runs on Python 2.6+ and Python 3 (version-specific imports are selected
# below with try/except).

# ----------------------------------------------------------------------------
# Content of htmlTableParse
'''\
HTML Table Parser

htmlTableParse.py

jjk  01/13/1998  001  from CTtableParse.py 002b
jjk  01/14/1998  002  use UserList
jjk  02/03/1998  003  rename (was CThtmlTableParse), split out tests
jjk  10/10/1998  004  add test() back in, add some utilities and improve docs
jjk  01/05/1999  005  fix/improve RemoveTags() add ConvertSpecialCharacters()
str  10/08/2001  006  Use module re instead of regex
str  22/09/2008  007  Integrate in x-dict

The ParsedDocument class breaks up an HTML string by tables, rows, and
columns.  (The HTML code must have opening and closing TABLE, TR and TD tags;
nested tables are not supported because the first closing tag found ends the
element.)  The parsed data is retrieved as a list of lists of lists, i.e.

    table4 = aParsedDocument[3]
    table4Row1 = aParsedDocument[3][0]
    table4Row1Col3 = aParsedDocument[3][0][2]

example usage:
    import htmlTableParse
    pd = htmlTableParse.ParsedDocument(htmlSourceString)
    numberOfTables = len(pd)
    contentsOfFifthColumnOfThirdRowOfSecondTable = pd[1][2][4]

some useful functions:
    ParseOpenFile(fileStream)
        #answer a ParsedDocument instance for contents of open file
    ParseFile(fileName)
        #answer a ParsedDocument instance for contents of a file
    ParseURL(url)
        #answer a ParsedDocument instance for contents of a url
    RemoveTags(htmlString)
        #remove all html tags from a string
    ConvertSpecialCharacters(htmlString)
        #remove all special ampersand characters from a string

see also test() function (not part of this integrated copy)

*** use this code at your own risk ***
*** some code may have been borrowed from other python modules ***
*** the programmer is a re newbie - this may not be the optimal solution :-) ***
'''

import optparse
import os
import re
import string  # no longer used below; retained so the module namespace is unchanged
import sys
import time
import urllib

try:  # Python 2
    from UserList import UserList as _UserList
except ImportError:  # Python 3
    from collections import UserList as _UserList

try:  # Python 2
    import cPickle as pickle
except ImportError:  # Python 3
    import pickle

try:  # Python 2
    from urllib import urlcleanup as _urlcleanup
    from urllib import urlretrieve as _urlretrieve
except ImportError:  # Python 3
    import urllib.request
    from urllib.request import urlcleanup as _urlcleanup
    from urllib.request import urlretrieve as _urlretrieve

try:  # Python 2
    _readLine = raw_input
except NameError:  # Python 3
    _readLine = input

# Set to a true value to trace RemoveTags()/ConvertSpecialCharacters()
# interactively (prints every step and waits for <Enter>).
TraceFlag = 0

Re1 = '[Tt][Aa][Bb][Ll][Ee]'
TableStart = re.compile('<[ \t]*' + Re1 + '[^<]*>')
TableEnd = re.compile('<[ \t]*/' + Re1 + '[ \t]*>')
Re2 = '[Tt][Rr]'
RowStart = re.compile('<[ \t]*' + Re2 + '[^<]*>')
RowEnd = re.compile('<[ \t]*/' + Re2 + '[ \t]*>')
Re3 = '[Tt][Dd]'
ColStart = re.compile('<[ \t]*' + Re3 + '[^<]*>')
ColEnd = re.compile('<[ \t]*/' + Re3 + '[ \t]*>')
TagRe = re.compile('<[^<]*>')

# (compiled entity pattern, replacement text) pairs used by
# ConvertSpecialCharacters().
AmpChars = [
    (re.compile('&[Nn][Bb][Ss][Pp];'), ' '),
    (re.compile('&[Aa][Mm][Pp];'), '&'),
]


class ParsedDocument(_UserList):
    '''List of the ParsedTable objects found in an HTML string.

    Subclasses reuse the same splitting loop and only override
    _parseParams() to select the tag pair and the child class.
    (jjk 01/14/98)
    '''

    def __init__(self, htmlSrc=''):
        '''Parse htmlSrc and fill the list with the elements found.'''
        _UserList.__init__(self)
        self._parseContents(htmlSrc)

    def report(self, outs, prefix=''):
        '''Write every column to the stream outs, one per line.

        Each line is prefixed with the index path of the column, e.g.
        "0>2>1>" for table 0, row 2, column 1.
        '''
        for index, item in enumerate(self.data):
            item.report(outs, prefix + str(index) + '>')

    def reportStructure(self, outs, prefix=''):
        '''Write the table/row/column counts to the stream outs.

        prefix is accepted for symmetry with report() but is not used.
        '''
        outs.write('ParsedDocument: %d tables\n' % len(self))
        for tableIndex, table in enumerate(self.data):
            outs.write('\tTable #%d: %d rows\n' % (tableIndex, len(table)))
            for rowIndex, row in enumerate(table):
                outs.write('\t\tRow #%d: %d columns\n' % (rowIndex, len(row)))

    def _parseParams(self):
        '''Answer (start regex, end regex, class of the child elements).'''
        return (TableStart, TableEnd, ParsedTable)

    def _parseContents(self, htmlSrc):
        '''Split htmlSrc into child elements and append them to self.

        Each child receives the text from its opening tag up to (but not
        including) the next closing tag.  A missing closing tag makes the
        child run to the end of the input.
        '''
        startRegex, endRegex, contentClass = self._parseParams()
        remaining = htmlSrc
        while True:
            startMatch = startRegex.search(remaining)
            if startMatch is None:
                break
            remaining = remaining[startMatch.start():]
            endMatch = endRegex.search(remaining)
            if endMatch is None:
                end = len(remaining)
            else:
                end = endMatch.start()
            self.append(contentClass(remaining[:end]))
            remaining = remaining[end:]

    def tables(self):
        '''Answer the list of parsed tables.'''
        return self.data


class ParsedTable(ParsedDocument):
    '''List of the ParsedRow objects of one table.  (jjk 01/13/98)'''

    def _parseParams(self):
        '''Answer (start regex, end regex, class of the child elements).'''
        return (RowStart, RowEnd, ParsedRow)

    def rows(self):
        '''Answer the list of parsed rows.'''
        return self.data


class ParsedRow(ParsedTable):
    '''List of the ParsedColumn objects of one table row.  (jjk 01/13/98)'''

    def _parseParams(self):
        '''Answer (start regex, end regex, class of the child elements).'''
        return (ColStart, ColEnd, ParsedColumn)

    def columns(self):
        '''Answer the list of parsed columns.'''
        return self.data


class ParsedColumn(object):
    '''Contents of one table cell; repr() answers the stripped cell text.

    (jjk 01/13/98)
    '''

    def __init__(self, htmlSrc=''):
        '''Extract the cell text from htmlSrc into self.contents.'''
        self.contents = ''
        self._parseContents(htmlSrc)

    def __repr__(self):
        '''Answer the cell text itself (Dict.wash() relies on this).'''
        return self.contents

    def _parseParams(self):
        '''Answer the parse parameters; not used by this class itself.'''
        return (TableStart, ColEnd, ParsedTable)

    def _parseContents(self, htmlSrc):
        '''Store the text between the opening TD tag and the closing one.

        Without an opening TD tag the contents stay empty; without a
        closing TD tag everything after the opening tag is taken.
        '''
        startMatch = ColStart.search(htmlSrc)
        if startMatch is None:
            return
        # ColStart is greedy up to the last '>' before the next '<'; the
        # opening tag really ends at the first '>'.
        tagClose = htmlSrc.find('>', startMatch.start())
        body = htmlSrc[tagClose + 1:]
        endMatch = ColEnd.search(body)
        if endMatch is not None:
            # The original referenced an undefined name (`ps`) here.
            body = body[:endMatch.start()]
        self.contents = body.strip()

    def report(self, outs, prefix=''):
        '''Write prefix + contents as one line to the stream outs.'''
        outs.write(prefix + self.contents + '\n')


def _substituteRepeatedly(pattern, replacement, text):
    '''Replace the first match of pattern until none is left.

    The search restarts from the beginning after every replacement, so
    matches created by an earlier replacement are replaced as well.
    Honours TraceFlag.
    '''
    while True:
        if TraceFlag:
            print('~ %s' % text)
        match = pattern.search(text)
        if match is None:
            break
        if TraceFlag:
            print('~~ %d %d %s' % (match.start(), match.end(), match.group()))
        text = text[:match.start()] + replacement + text[match.end():]
        if TraceFlag:
            _readLine('z')
    return text


def RemoveTags(htmlString):
    '''Replace every HTML tag in htmlString with a single space.

    (jjk 01/05/99; ported from the integer-returning `regex` API to `re`
    match objects - the old code also referenced the undefined name
    TagRegex.)
    '''
    return _substituteRepeatedly(TagRe, ' ', htmlString)


def ConvertSpecialCharacters(htmlString):
    '''Replace the entities listed in AmpChars by their plain characters.

    (jjk 01/05/99; ported from the `regex` API to `re` match objects.)
    '''
    converted = htmlString
    for pattern, replacement in AmpChars:
        converted = _substituteRepeatedly(pattern, replacement, converted)
    return converted


def ParseOpenFile(fileStream=sys.stdin):
    '''public: answer a ParsedDocument instance for contents of open file

    jjk 10/10/98
    '''
    return ParsedDocument(fileStream.read())


def _openForParsing(fileName):
    '''Open fileName for reading text.

    Python 3 decodes while reading; undecodable bytes are replaced so that
    a page in an unexpected encoding does not abort the lookup.
    '''
    if sys.version_info[0] >= 3:
        return open(fileName, errors='replace')
    return open(fileName)


def ParseFile(fileName):
    '''public: answer a ParsedDocument instance for contents of a file

    jjk 10/10/98
    '''
    with _openForParsing(fileName) as fileStream:
        return ParseOpenFile(fileStream)


def ParseURL(url, proxy=None):
    '''public: answer a ParsedDocument instance for contents of a url

    If proxy is given it is exported as the http_proxy environment
    variable before the request is made.

    jjk 10/10/98
    '''
    if proxy:
        os.environ['http_proxy'] = proxy
    # The opener reads the proxy settings when it is created, so it has to
    # be (re)built after http_proxy was set.  On Python 3 urlcleanup() also
    # drops the installed opener, hence the rebuild on every call.
    _installUrlOpener()
    try:
        fileName, _headers = _urlretrieve(url)
        return ParseFile(fileName)
    finally:
        # Remove the temporary download created by urlretrieve().
        _urlcleanup()

#
# End of htmlTableParse.py
# ----------------------------------------------------------------------------
# begin of x-dict

XDICT_VERSION = "2010-04-15"

# True when started from within Emacs; a fixed query is used then.
DEBUG_ACTIVE = "INSIDE_EMACS" in os.environ

# ----------------------------------------------------------------------------
# Command line processing
# ----------------------------------------------------------------------------
parser = optparse.OptionParser(usage="""usage: %prog [options] query""")
parser.add_option("--leo", action="store_true", dest="dict_leo",
                  default=False, help="Use dict.leo.org, the default")
parser.add_option("--dict_cc", action="store_true", dest="dict_dict_cc",
                  default=False, help="Use www.dict.cc")
parser.add_option("--http_proxy", dest="http_proxy", default=None,
                  help="A proxy server")
parser.add_option("--coding", dest="coding", default=None,
                  help="Use the given coding system")
parser.add_option("-c", "--use-cache", action="store_true", dest="use_cache",
                  default=False, help="Cache the results on disk")
parser.add_option("-w", "--column-width", dest="colwidth", type="int",
                  default=50, help="The output column width")
parser.add_option("-n", "--do-nothing", action="store_true",
                  dest="do_nothing", default=False,
                  help="Do nothing, used for debugging")
parser.add_option("--version", action="store_true", dest="show_version",
                  default=False, help="Show x-dict version info")


def _parseCommandLine(argv):
    '''Answer (options, args) for argv (None means sys.argv[1:]).

    Adds the derived colwidth1/colwidth2 attributes and selects
    dict.leo.org when no dictionary was requested.
    '''
    parsedOptions, positional = parser.parse_args(argv)
    parsedOptions.colwidth1 = parsedOptions.colwidth
    parsedOptions.colwidth2 = parsedOptions.colwidth
    if not (parsedOptions.dict_leo or parsedOptions.dict_dict_cc):
        parsedOptions.dict_leo = True
    return parsedOptions, positional


if __name__ == "__main__":
    if DEBUG_ACTIVE:
        sys.argv = ["x-dict", "car"]
        # sys.argv.append("--dict_cc")
    (options, args) = _parseCommandLine(None)
    if options.show_version:
        print("x-dict v%s" % XDICT_VERSION)
        sys.exit()
    if len(args) != 1:
        parser.print_help()
        sys.exit()
else:
    # Imported as a module: use the option defaults instead of parsing (and
    # possibly exiting on) the command line of the importing program.
    (options, args) = _parseCommandLine([])

# Two left-aligned columns, each options.colwidth characters wide.
gLineFormatString = "%%-%ds%%-%ds" % (options.colwidth1, options.colwidth2)

# Path of the on-disk cache; None while caching is disabled.
gCacheFilename = None


def process_options():
    '''Derive global settings from the parsed options.

    Sets gCacheFilename to ~/.xdict.cache when --use-cache was given.
    '''
    global gCacheFilename
    if options.use_cache:
        # expanduser() also works when $HOME is not set.
        gCacheFilename = os.path.join(os.path.expanduser("~"), ".xdict.cache")

# ----------------------------------------------------------------------------


def open_wcache():
    '''Load the word cache from gCacheFilename into the global wcache.

    Does nothing when caching is disabled.  A missing or unreadable cache
    file yields an empty cache.
    '''
    global wcache
    if not gCacheFilename:
        return
    try:
        # NOTE: unpickling executes code embedded in the file; this is only
        # acceptable because the cache lives in the user's own home directory.
        with open(gCacheFilename, "rb") as cacheFile:
            wcache = pickle.load(cacheFile)
    except Exception:
        # Deliberately best-effort: a corrupt pickle can raise almost any
        # exception type, and a broken cache must not prevent a lookup.
        wcache = {}
    # print("wcache: %r" % (wcache,))


# query string -> (timestamp, word list)
wcache = {}

# Base class of the URL opener: FancyURLopener on Python 2, where
# urllib.urlretrieve() goes through urllib._urlopener; plain object on
# Python 3, where the opener is installed via urllib.request instead.
_FancyURLopener = getattr(urllib, 'FancyURLopener', object)


# Tell the server, that we are x-dict
class XDictUrlOpener(_FancyURLopener):
    '''URL opener whose `version` is sent as the User-Agent.'''
    version = "x-dict/%s" % XDICT_VERSION


def _installUrlOpener():
    '''Make urlretrieve() identify itself as x-dict.'''
    if _FancyURLopener is not object:  # Python 2
        urllib._urlopener = XDictUrlOpener()
    else:  # Python 3
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', XDictUrlOpener.version)]
        urllib.request.install_opener(opener)


_installUrlOpener()

# ----------------------------------------------------------------------------
# Common dictionary class
# ----------------------------------------------------------------------------


class Dict(object):
    '''Behaviour shared by all dictionary back ends.'''

    # NOTE(review): in the reviewed copy of this file every tag literal used
    # by wash() had been blanked out (each pattern was ""), which turned the
    # replacements into no-ops and made the <sup> regex match every single
    # character.  The literals below were rebuilt from the comments next to
    # them - confirm against the upstream x-dict source.
    _PLAIN_TAGS = (
        "<B>", "</B>", "<b>", "</b>",   # bold face
        "<I>", "</I>",                  # italic face
        "<i>", "</i>",                  # italic face, lower-case variant
        "<small>", "</small>",          # smaller fonts
        "<kbd>", "</kbd>",
        "<var>", "</var>",
    )
    _SUP_RE = re.compile("<sup>(.+?)</sup>", re.IGNORECASE)
    _LINK_START_RE = re.compile("<a [^>]*>", re.IGNORECASE)
    _IMAGE_RE = re.compile("<img [^>]*>")

    def wash(self, str):
        '''Answer the text of a parsed cell with the HTML markup removed.

        str is a ParsedColumn (or any object whose repr() is its text).
        Superscripts are rendered as " [text] ".
        '''
        text = repr(str)
        for tag in self._PLAIN_TAGS:
            text = text.replace(tag, "")
        # wash sup
        text = self._SUP_RE.sub(" [\\1] ", text)
        # wash the links
        text = self._LINK_START_RE.sub("", text)
        text = text.replace("</A>", "")
        text = text.replace("</a>", "")
        # wash links to images
        text = self._IMAGE_RE.sub("", text)
        # NOTE(review): two further tag families were washed at this point
        # (one "<tag ...>"/"</tag>" pair and one single "<tag ...>"); their
        # names did not survive in the reviewed copy and are not reconstructed.
        # remove &nbsp;
        text = text.replace("&nbsp;", "")
        # remove chr(A0)
        text = text.replace(chr(0xA0), "")
        # remove &#160;
        text = text.replace("&#160;", "")
        return text

    def show(self, wl):
        '''Print the (english, german) pairs of wl as two columns.'''
        for eng, ger in wl:
            line = (gLineFormatString % (eng, ger)).strip()
            # Only byte strings (Python 2 str) can be decoded.
            if options.coding and isinstance(line, bytes):
                line = line.decode(options.coding)
            print(line)

# ----------------------------------------------------------------------------
# Support for dict.leo.org
# ----------------------------------------------------------------------------


class Dict_leo_org(Dict):
    '''Lookup via dict.leo.org, optionally backed by the on-disk cache.'''

    def search(self, query):
        '''Answer the list of (english, german) pairs for query.

        Results come from wcache when present; otherwise the page is
        fetched, parsed and - if caching is enabled - stored on disk.
        '''
        query = query.replace(" ", "+")
        if query in wcache:
            t, wl = wcache[query]
            print("Using data from cache (%s)"
                  % time.strftime("%c", time.localtime(t)))
        else:
            # noglob wget -O ~/tmp/leo.html http://dict.leo.org/?search=crucial
            # pd = ParseFile(os.path.expanduser('~/tmp/leo.html'))
            pd = ParseURL('http://dict.leo.org/?search=' + query,
                          options.http_proxy)
            self.pd = pd
            wl = self.parse_result(pd)
            if gCacheFilename:
                wcache[query] = (time.time(), wl)
                with open(gCacheFilename, "wb") as cacheFile:
                    pickle.dump(wcache, cacheFile)
        return wl

    def parse_result(self, pd):
        '''Extract the word pairs from the parsed result page pd.

        Reads the 8th table, skips its first two rows and takes columns 1
        and 3 of every row with more than three columns.  Raises IndexError
        when the page contains fewer than eight tables.
        '''
        wordlist = pd[7]
        wl = []
        for i in range(2, len(wordlist)):
            row = wordlist[i]
            if len(row) > 3:
                eng = self.wash(row[1])
                ger = self.wash(row[3])
                if eng or ger:
                    wl.append((eng, ger))
        return wl

# ----------------------------------------------------------------------------
# Support for www.dict.cc
# ----------------------------------------------------------------------------


class Dict_dict_cc(Dict):
    '''Lookup via www.dict.cc (no caching).'''

    def search(self, query):
        '''Answer the list of (english, german) pairs for query.'''
        query = query.replace(" ", "+")
        # noglob wget -O ~/tmp/di.html http://www.dict.cc/?s=crucial
        # pd = ParseFile(os.path.expanduser('~/tmp/di.html'))
        pd = ParseURL('http://www.dict.cc/?s=' + query, options.http_proxy)
        self.pd = pd
        return self.parse_result(pd)

    def parse_result(self, pd):
        '''Extract the word pairs from the parsed result page pd.

        Reads the 3rd table, skips its first row and takes columns 1 and 2.
        Raises IndexError when the page contains fewer than three tables.
        '''
        wordlist = pd[2]
        wl = []
        for i in range(1, len(wordlist)):
            row = wordlist[i]
            # Column 2 is read below, so at least three columns are needed
            # (the original tested "> 1" and failed on two-column rows).
            if len(row) > 2:
                eng = self.wash(row[1])
                ger = self.wash(row[2])
                if eng or ger:
                    wl.append((eng, ger))
        return wl

# ----------------------------------------------------------------------------
# Emacs interface via pymacs ... not up to date at the moment
# ----------------------------------------------------------------------------


interactions = {}


def wl_as_string(wl):
    '''Answer wl formatted as two padded columns, one pair per line.'''
    return "".join(gLineFormatString % (eng, ger) + "\n" for eng, ger in wl)


def search(string):
    '''Look up string on dict.leo.org and answer the formatted result.

    (The original called the undefined function search_internal_leo.)
    '''
    return wl_as_string(Dict_leo_org().search(string))


interactions[search] = "sSearch in dict.leo.org: "

# ----------------------------------------------------------------------------
# Lookup the word if called as script
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    process_options()
    open_wcache()
    if not options.do_nothing:
        search_string = " ".join(args)
        # search_string = "Fahrrad"
        # dict.leo.org wins when both dictionaries are requested.
        if options.dict_leo:
            d = Dict_leo_org()
        else:
            d = Dict_dict_cc()
        wl = d.search(search_string)
        d.show(wl)

# Local Variables:
# mode: python
# End:

# arch-tag: 32c9c2f3-c41e-4062-a2be-71c0039e583d