#!/usr/local/bin/python
"""
Barron Notes Extractor
by Michael Huynh (mike@mikexstudios.com | http://www.mikexstudios.com)
-------------------------------
Purpose:
Makes a best-effort extraction of text from the HTML code of Barron Notes from
Pink Monkey. Pages from Pink Monkey were saved with wget.
"""

import dircache #for opening directory listdir()
import os
import re
import sys

#Making this script a class to whet my Java appetite

class BarronNotesExtractor:
    """Extract the notes text from saved Pink Monkey Barron Notes pages.

    Creating an instance immediately processes every file in ``dirOfNotes``
    whose name ends with ``.<extOfNotes>`` and writes the extracted text of
    all of them, in sorted filename order, to ``extNotesFile``.
    """

    #Directory of saved notes
    dirOfNotes = './'
    #Extension of notes
    extOfNotes = 'html' #other extensions can be asp, php, htm, etc.
    #File of extracted notes
    extNotesFile = 'extracted.txt'

    #Initializing the class
    def __init__(self):
        """Run the extraction over ``dirOfNotes`` right away."""
        self.processDirectory()

    def extract(self, inDirtyHTML):
        """Return the text between ``<HR>\\n`` and ``</CENTER>Paradise``.

        inDirtyHTML -- full HTML source of one saved page.

        Returns the captured text, or None when the page does not contain
        both markers (previously this raised AttributeError on None).
        """
        matched = re.match('.*<HR>\n(.*)</CENTER>Paradise.*', inDirtyHTML, re.DOTALL)
        if matched is None:
            return None
        return matched.group(1)

    def processDirectory(self):
        """Extract every matching file in ``dirOfNotes`` into ``extNotesFile``.

        The output file is truncated first, then one block of cleaned text
        (followed by a newline) is written per processed page. Pages in
        which extract() finds nothing are reported and skipped instead of
        aborting the whole run.
        """
        #Sorted listing, which is what dircache.listdir() used to return
        filesInDir = sorted(os.listdir(self.dirOfNotes))

        #Opening with 'w' empties any previous output; the with-block
        #guarantees the handle is closed even if a page fails to process
        with open(self.extNotesFile, 'w') as outputFileHandler:
            for eachFile in filesInDir:
                #Names from listdir() are relative to dirOfNotes, not the cwd
                fullPath = os.path.join(self.dirOfNotes, eachFile)

                #Ignore files with other extensions and non-files
                if not self.isExtensionGood(eachFile):
                    continue
                if not os.path.isfile(fullPath):
                    continue

                #Trailing space, no newline: the result is printed on the same line
                sys.stdout.write('Processing: '+eachFile+' ..... ')

                #Read the full HTML code into a string
                with open(fullPath, 'r') as inputFileHandler:
                    strFullFile = inputFileHandler.read()

                #Send string into extraction
                extractedText = self.extract(strFullFile)
                if extractedText is None:
                    sys.stdout.write('skipped (no notes text found)\n')
                    continue

                #More filtering
                extractedText = self.stripUselessHTML(extractedText)

                #Write extracted to the output file
                outputFileHandler.write(extractedText+'\n')

                sys.stdout.write('written to '+self.extNotesFile+'\n')

    #Checks for good file extension (defined by extOfNotes)
    def isExtensionGood(self, inFilename):
        """Return True when inFilename ends with ``'.' + extOfNotes``.

        A plain suffix test is used so that names which merely contain the
        extension (``a.html.bak``, ``a.htmlx``) are rejected and so that
        regex metacharacters in extOfNotes are taken literally.
        """
        return inFilename.endswith('.'+self.extOfNotes)

    def stripUselessHTML(self, inHalfHTML):
        """Return inHalfHTML with links, tags and layout clutter removed.

        inHalfHTML -- the HTML fragment returned by extract().
        """
        #Remove links (tags and link text). \b stops other tags that start
        #with "a" (e.g. <ADDRESS>) from being treated as the start of a link
        linkPattern = re.compile(r'<a\b[^>]*>.*?</a>', re.IGNORECASE)
        inHalfHTML = linkPattern.sub('', inHalfHTML)

        #Convert <BR><BR> into a newline
        inHalfHTML = inHalfHTML.replace('<BR><BR>', '\n')

        #Remove HTML tags that sit on a single line
        inHalfHTML = re.sub(r'<.*?>', '', inHalfHTML)

        #Remove IMG tags that span several lines and therefore survived the
        #single-line pass above; the pages use upper-case tags, so ignore case
        imgPattern = re.compile(r'<img.*?>', re.DOTALL | re.IGNORECASE)
        inHalfHTML = imgPattern.sub('', inHalfHTML)

        #Remove excessive whitespace
        inHalfHTML = re.sub(r'\s{7,}', '', inHalfHTML) #Adjust the value inside the brackets if needed

        #Remove annoying ' | '
        inHalfHTML = inHalfHTML.replace(' | ', '')
        inHalfHTML = re.sub(r'\|\s', '', inHalfHTML)

        #Change quotes back
        inHalfHTML = inHalfHTML.replace('&quot;', '"')

        return inHalfHTML



#Main execution: only run the extraction when this file is executed as a
#script, so that importing the module does not process the current directory
if __name__ == '__main__':
    x = BarronNotesExtractor()
