Textractor API textractor-720 (20091120123250)

textractor.html
Class AbstractHtml2Text

java.lang.Object
  extended by textractor.html.AbstractHtml2Text
Direct Known Subclasses:
Html2Text2DB

public abstract class AbstractHtml2Text
extends Object

Converts HTML to text. This translator replaces each image with its alt text. This is useful because many journals use images for Greek symbols and supply an alt attribute so that text-only browsers can render them.


Field Summary
protected  boolean appendSentencesInOneDocument
           
protected  int articleChunkSize
           
protected  boolean noSentenceBoundaryTag
           
protected  boolean verbose
           
 
Constructor Summary
AbstractHtml2Text(String[] args)
           
 
Method Summary
protected  Collection<Sentence> createSentenceOneDocument(Article article, Iterator<MutableString> sentencesAsTextIterator, String title)
           
protected  Collection<Sentence> createSentences(Article article, Iterator<MutableString> sentencesAsTextIterator, String title)
           
abstract  TextConsumer getConsumer()
           
protected  Collection<Sentence> loadArticleSentences(Article article, String title, String text, Map<String,Object> additionalFieldsMap)
           
 void process(String[] args)
           
abstract  void setConsumer(TextConsumer consumer)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

appendSentencesInOneDocument

protected boolean appendSentencesInOneDocument

noSentenceBoundaryTag

protected boolean noSentenceBoundaryTag

verbose

protected final boolean verbose

articleChunkSize

protected int articleChunkSize
Constructor Detail

AbstractHtml2Text

public AbstractHtml2Text(String[] args)
Method Detail

process

public void process(String[] args)
             throws IOException,
                    org.htmlparser.util.ParserException,
                    ConfigurationException,
                    SentenceProcessingException
Throws:
IOException
org.htmlparser.util.ParserException
ConfigurationException
SentenceProcessingException

loadArticleSentences

protected final Collection<Sentence> loadArticleSentences(Article article,
                                                          String title,
                                                          String text,
                                                          Map<String,Object> additionalFieldsMap)
                                                   throws SentenceProcessingException
Throws:
SentenceProcessingException

createSentenceOneDocument

protected final Collection<Sentence> createSentenceOneDocument(Article article,
                                                               Iterator<MutableString> sentencesAsTextIterator,
                                                               String title)
                                                        throws SentenceProcessingException
Throws:
SentenceProcessingException

createSentences

protected final Collection<Sentence> createSentences(Article article,
                                                     Iterator<MutableString> sentencesAsTextIterator,
                                                     String title)
                                              throws SentenceProcessingException
Throws:
SentenceProcessingException

setConsumer

public abstract void setConsumer(TextConsumer consumer)

getConsumer

public abstract TextConsumer getConsumer()

Textractor API textractor-720 (20091120123250)

Copyright © 2003-2008 Institute for Computational Biomedicine, All Rights Reserved.