Lucene: Extracting Text From PowerPoint

In order to properly index Microsoft PowerPoint files using Lucene, you must be able to extract the text from the presentation. The following code example can be used for this purpose:

package ca.intelliware.example;

import java.io.InputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.commons.logging.LogFactory;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.LittleEndian;

/**
 * Extracts the slide text of a binary PowerPoint file by listening to the
 * POIFS event reader and walking the record tree of the
 * "PowerPoint Document" stream.
 *
 * <p>Extraction is best-effort: a stream that cannot be read or is malformed
 * is reported on standard error and whatever text was collected up to that
 * point is still returned.</p>
 *
 * <p>{@link #extractText(InputStream)} is synchronized because the text
 * collected by the listener callback is held in an instance field.</p>
 */
public class PowerPointTextExtractor implements POIFSReaderListener {

  /** Size of the header that precedes every record: version/instance, type, length. */
  private static final int RECORD_HEADER_SIZE = 8;

  /** Value of the low four bits of the first header word that marks a container record. */
  private static final int CONTAINER_VERSION = 0x0f;

  /** Record type of a text atom stored as UTF-16LE code units (two bytes per character). */
  private static final int TEXT_CHARS_ATOM = 4000;

  /** Record type of a text atom stored as one byte per character (the low byte of the code unit). */
  private static final int TEXT_BYTES_ATOM = 4008;

  /** Text collected during the current {@link #extractText(InputStream)} call. */
  private StringBuilder text;

  /**
   * Reads a PowerPoint file and returns the text of its text atoms, each
   * followed by a single space.
   *
   * @param inputStream the raw bytes of the PowerPoint (OLE2) file
   * @return the extracted text; empty if no text records were found
   * @throws IOException if the POIFS reader fails to read the stream
   */
  public synchronized String extractText(InputStream inputStream)
      throws IOException {
    POIFSReader reader = new POIFSReader();
    this.text = new StringBuilder();
    reader.registerListener(this);
    reader.read(inputStream);
    return this.text.toString();
  }

  /**
   * Callback invoked by the POIFS reader for every document in the file.
   * Only the "PowerPoint Document" stream is processed.
   *
   * @param event describes the document that was just read
   */
  public void processPOIFSReaderEvent(POIFSReaderEvent event) {

    try {
      if (isComponentWeCareAbout(event)) {
        DocumentInputStream input = event.getStream();
        try {
          byte[] buffer = new byte[input.available()];
          // A single read() is not guaranteed to fill the buffer, so keep
          // reading until it is full or the stream ends.
          int filled = 0;
          while (filled < buffer.length) {
            int count = input.read(buffer, filled, buffer.length - filled);
            if (count <= 0) {
              break;
            }
            filled += count;
          }
          processContent(buffer, 0, filled);
        } finally {
          input.close();
        }
      }
    } catch (IOException e) {
      // Best-effort: the listener interface cannot propagate checked
      // exceptions, so report the problem and keep what was extracted.
      e.printStackTrace();
    } catch (RuntimeException e) {
      e.printStackTrace();
    }
  }

  /** Returns true for the stream that holds the presentation's records. */
  private boolean isComponentWeCareAbout(POIFSReaderEvent event) {
    return event.getName().equalsIgnoreCase("PowerPoint Document");
  }

  /**
   * Walks the records in {@code buffer[beginIndex, endIndex)}, descending into
   * container records and appending the contents of text atoms to the result.
   *
   * @param buffer     the bytes of the "PowerPoint Document" stream
   * @param beginIndex offset of the first record header to examine
   * @param endIndex   offset one past the last byte that may be examined
   */
  private void processContent(byte[] buffer, int beginIndex, int endIndex) {
    int position = beginIndex;
    // Stop when fewer bytes than a full header remain; a trailing fragment
    // cannot be a record.
    while (endIndex - position >= RECORD_HEADER_SIZE) {
      int versionAndInstance = LittleEndian.getUShort(buffer, position);
      int recordType = LittleEndian.getUShort(buffer, position + 2);
      long recordLength = LittleEndian.getUInt(buffer, position + 4);
      position += RECORD_HEADER_SIZE;

      // The length is an unsigned 32-bit value taken from the file. Clamp it
      // to the bytes that are really available so that a corrupt value can
      // neither run past the buffer nor turn negative when narrowed to int
      // (which previously could stall the loop).
      int length = (int) Math.min(recordLength, (long) (endIndex - position));

      if ((versionAndInstance & CONTAINER_VERSION) == CONTAINER_VERSION) {
        processContent(buffer, position, position + length);
      } else if (recordType == TEXT_BYTES_ATOM) {
        // One byte per character: each byte is the low byte of the code unit.
        for (int i = 0; i < length; i++) {
          this.text.append((char) (buffer[position + i] & 0xff));
        }
        this.text.append(' ');
      } else if (recordType == TEXT_CHARS_ATOM) {
        // Two bytes per character, little-endian; an odd trailing byte is ignored.
        for (int i = 0; i + 1 < length; i += 2) {
          this.text.append((char) LittleEndian.getUShort(buffer, position + i));
        }
        this.text.append(' ');
      }
      position += length;
    }
  }
}

This example uses Jakarta POI to read the native PowerPoint document.

It's only fair to share...
Share on Facebook | Google+ | Tweet about this on Twitter | Share on LinkedIn

Leave a Reply