Extract Image from PDF using Java
I need to extract only the bar code from a PDF (using a rectangle), without converting the whole PDF into an image.
The image format can be jpg/png.
You can use PDFBox:
// Walk every page of the already-opened `document` and write each image
// listed in the page's resources to its own file.
for( Object pageObj : document.getDocumentCatalog().getAllPages() )
{
    PDResources pageResources = ((PDPage)pageObj).getResources();
    Map pageImages = pageResources.getImages();
    if( pageImages == null )
    {
        // This page exposes no image map; move on to the next page.
        continue;
    }
    for( Object imageKey : pageImages.keySet() )
    {
        String key = (String)imageKey;
        PDXObjectImage xImage = (PDXObjectImage)pageImages.get( key );
        // getUniqueFileName is a helper defined outside this snippet; it is
        // given the resource key and the image's suffix.
        String fileName = getUniqueFileName( key, xImage.getSuffix() );
        System.out.println( "Writing image:" + fileName );
        xImage.write2file( fileName );
    }
}
Reference source code
With PDFBox, without writing any code:
"$JAVA_HOME/bin/java" -jar pdfbox-app-1.8.2.jar PDFToImage foo.pdf
To process a batch of files:
import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.List;
import java.util.Observer;
import org.apache.pdfbox.PDFToImage;
/**
 * Batch driver around PDFBox's {@code PDFToImage} tool: converts every PDF of
 * a directory (or of an explicit list) and writes the results into a target
 * directory.
 */
public class Main {

    static {
        // Select commons-logging's NoOpLog implementation before any library
        // class asks for a logger.
        System.setProperty(
            "org.apache.commons.logging.Log",
            "org.apache.commons.logging.impl.NoOpLog" );
    }

    /**
     * Runs {@code PDFToImage} on each file and reports progress on stdout.
     *
     * @param files    the files to convert
     * @param jpegDir  the directory receiving the generated images; created
     *                 (with parents) when missing
     * @param observer optional callback; {@code update( null, file )} is
     *                 invoked after each file whose conversion returned
     *                 without throwing. May be {@code null}.
     * @return the number of files whose conversion returned without throwing;
     *         0 when the output directory cannot be created
     */
    public static int extract( List< File > files, File jpegDir, Observer observer ) {
        jpegDir.mkdirs();
        if( !jpegDir.isDirectory()) {
            // Without a target directory every conversion would fail anyway.
            System.err.println( "Cannot create output directory: " + jpegDir.getPath());
            return 0;
        }
        int done = 0;
        for( final File file : files ) {
            try {
                // Strip the extension from the file NAME only. Searching the
                // whole path for '.' fails on extension-less names and can cut
                // the path inside a directory name that contains a dot.
                final String prefix =
                    new File( jpegDir, baseName( file.getName())).getPath();
                PDFToImage.main( new String[]{ "-outputPrefix", prefix, file.getPath() });
                final double percent = (100.0 * ++done ) / files.size();
                System.out.printf( "%6.2f %%: %s\n", percent, file.getName());
                if( observer != null ) {
                    observer.update( null, file );
                }
            }
            catch( final Throwable t ) {
                // Deliberately broad: one failing file must not abort the batch.
                System.err.println( file.getPath());
                t.printStackTrace();
            }
        }
        return done;
    }

    /**
     * Returns {@code fileName} without its last extension. Names without a
     * dot, or whose only dot is the first character, are returned unchanged.
     */
    private static String baseName( String fileName ) {
        final int dot = fileName.lastIndexOf( '.' );
        return ( dot > 0 ) ? fileName.substring( 0, dot ) : fileName;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code <PDF directory> <JPEG directory>}
     */
    public static void main( String[] args ) {
        if( args.length != 2 ) {
            System.err.println(
                "usage: java -jar pdf2img.jar <PDF directory> <JPEG directory>" );
            System.exit(1);
        }
        final File pdfDir  = new File( args[0] );
        final File jpegDir = new File( args[1] );
        final File[] files = pdfDir.listFiles( new FilenameFilter() {
            @Override public boolean accept( File dir, String name ) {
                return name.toLowerCase().endsWith( ".pdf" );
            }});
        if( files == null ) {
            // listFiles() returns null when the path is not a directory or
            // cannot be read; report it instead of exiting silently.
            System.err.println( "Not a readable directory: " + pdfDir.getPath());
            System.exit(1);
        }
        final int done = extract( Arrays.asList( files ), jpegDir, null );
        System.out.printf(
            "\n%d file%s processed.", done, ( done > 1 ) ? "s" : "" );
    }
}
This utility can be paired with a GUI (localized in French):
/**
 * JavaFX front end for {@link Main#extract}.
 *
 * <p>The window has a read-only text field holding the output directory and a
 * table of files. Dropping a file or directory on the text field sets the
 * output directory; dropping files on the table queues them, runs the
 * extraction on a background thread, and ticks the last column as each file
 * completes. User-visible labels are in French.
 */
public final class GUI extends Application {

    /**
     * Builds the scene, wires the drag-and-drop handlers and shows the stage.
     *
     * @param primaryStage the stage supplied by the JavaFX runtime
     */
    @Override
    public void start( Stage primaryStage ) throws Exception {
        final BorderPane        pane      = new BorderPane();
        final HBox              topPane   = new HBox();
        final Label             lbl       = new Label( "Répertoire des images : " );
        final TextField         jpegDir   = new TextField();
        final Button            browseBtn = new Button( "Parcourir..." );
        final TableView< File > filesVw   = new TableView<>();

        // --- Layout: output-directory bar on top, file table in the centre.
        lbl.setAlignment( Pos.CENTER_LEFT );
        lbl      .setStyle( "-fx-padding:8px; -fx-margin:8px;" );
        jpegDir  .setStyle( "-fx-padding:8px; -fx-margin:8px;" );
        browseBtn.setStyle( "-fx-padding:8px; -fx-margin:8px;" );
        topPane.getChildren().addAll( lbl, jpegDir, browseBtn );
        pane.setTop( topPane );
        pane.setCenter( filesVw );
        jpegDir.setPrefColumnCount( 40 );
        // The directory is set by drag-and-drop only, never typed.
        jpegDir.setEditable( false );

        final ObservableList< TableColumn< File, ? >> columns = filesVw.getColumns();

        // --- Column "Nom": the file name.
        final TableColumn< File, String > name = new TableColumn<>( "Nom" );
        name.setCellValueFactory(
            new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
                @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
                    return new SimpleStringProperty( p.getValue().getName()); }});
        name.setSortable( false );
        name.setPrefWidth( 400 );
        columns.add( name );

        // --- Column "Taille": the file length, formatted with grouping separators.
        final TableColumn< File, String > size = new TableColumn<>( "Taille" );
        size.setCellValueFactory(
            new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
                @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
                    return new SimpleStringProperty( String.format( "%,12d", p.getValue().length())); }});
        size.setSortable( true );
        size.setPrefWidth( 80 );
        columns.add( size );

        // --- Column "Date": the last-modified timestamp.
        final TableColumn< File, String > date = new TableColumn<>( "Date" );
        // "yyyy" is the calendar year. The previous "YYYY" is the week-based
        // year, which shows the wrong year for dates around New Year.
        final SimpleDateFormat sdf = new SimpleDateFormat( "dd/MM/yyyy HH:mm" );
        date.setCellValueFactory(
            new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
                @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
                    return new SimpleStringProperty(
                        sdf.format( new Date( p.getValue().lastModified()))); }});
        date.setSortable( true );
        date.setPrefWidth( 120 );
        columns.add( date );

        // --- Column "Traité": one boolean property per file, flipped to true
        // by the extraction observer below and rendered as a check box.
        final Map< File, SimpleBooleanProperty > dones = new HashMap<>();
        final TableColumn< File, Boolean > done = new TableColumn<>( "Traité" );
        done.setCellValueFactory(
            new Callback< CellDataFeatures< File, Boolean >, ObservableValue< Boolean >>(){
                @Override public ObservableValue< Boolean > call( CellDataFeatures< File, Boolean > p ){
                    return dones.get( p.getValue()); }});
        done.setCellFactory(
            new Callback<TableColumn<File,Boolean>,TableCell<File,Boolean>>(){
                @Override public TableCell<File,Boolean> call( TableColumn<File,Boolean> p ){
                    return new CheckBoxTableCell<>(); }});
        done.setSortable( true );
        done.setPrefWidth( 40 );
        columns.add( done );

        // --- Drop target 1: the output-directory field.
        jpegDir.setOnDragOver(new EventHandler <DragEvent>() {
            @Override public void handle(DragEvent event) {
                if (event.getGestureSource() != jpegDir ) {
                    event.acceptTransferModes(TransferMode.COPY_OR_MOVE);
                }
                event.consume();
            }});
        jpegDir.setOnDragDropped(new EventHandler <DragEvent>() {
            @Override public void handle(DragEvent event) {
                final Dragboard db = event.getDragboard();
                boolean success = false;
                if( db.hasFiles()) {
                    // Only the first dropped entry is used as the directory.
                    jpegDir.setText( db.getFiles().get( 0 ).getPath());
                    success = true;
                }
                event.setDropCompleted( success );
                event.consume();
            }});

        // --- Drop target 2: the file table. Drops are refused until an output
        // directory has been chosen.
        filesVw.setOnDragOver(new EventHandler <DragEvent>() {
            @Override public void handle(DragEvent event) {
                if( event.getGestureSource() != filesVw && ! jpegDir.getText().isEmpty()) {
                    event.acceptTransferModes(TransferMode.COPY_OR_MOVE);
                }
                event.consume();
            }});
        filesVw.setOnDragDropped(new EventHandler <DragEvent>() {
            @Override public void handle(DragEvent event) {
                final Dragboard db = event.getDragboard();
                boolean success = false;
                if( db.hasFiles()) {
                    final List< File > files = db.getFiles();
                    final File target = new File( jpegDir.getText());
                    for( final File f : files ) {
                        dones.put( f, new SimpleBooleanProperty( false ));
                    }
                    filesVw.getItems().addAll( files );
                    // Block further drops while the batch is running.
                    filesVw.setDisable( true );
                    // Run the extraction off the FX thread; every UI mutation
                    // is marshalled back through Platform.runLater.
                    new Thread(){@Override public void run() {
                        Main.extract(
                            files, target,
                            new Observer(){ @Override public void update( Observable o, final Object file ) {
                                Platform.runLater( new Runnable() { @Override public void run() {
                                    dones.get( file ).setValue( Boolean.TRUE );
                                }});
                            }});
                        Platform.runLater( new Runnable() { @Override public void run() {
                            filesVw.setDisable( false );
                        }});
                    }}.start();
                    success = true;
                }
                event.setDropCompleted( success );
                event.consume();
            }});

        primaryStage.setScene( new Scene( pane ));
        primaryStage.setX( 0 );
        primaryStage.setY( 0 );
        primaryStage.show();
    }

    /**
     * Launches the JavaFX application.
     *
     * @param args command-line arguments, forwarded to the JavaFX runtime
     */
    public static void main( String[] args ) {
        launch( args );
    }
}
Extract images from pdf file using PDFBox
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
/**
 * Extracts the images drawn by a PDF's page content streams and writes each
 * one as a PNG file with a random (UUID) name.
 *
 * <p>Works by intercepting the {@code Do} operator while the pages are
 * processed: image XObjects are written to disk, form XObjects are processed
 * recursively via {@code showForm}.
 */
public class ExtractImagesUseCase extends PDFStreamEngine{

    private final String filePath;
    private final String outputDir;

    /**
     * @param filePath  path of the PDF file to read
     * @param outputDir directory receiving the extracted PNG files
     */
    public ExtractImagesUseCase(String filePath,
                                String outputDir){
        this.filePath = filePath;
        this.outputDir = outputDir;
    }

    /**
     * Loads the PDF and processes every page. An {@link IOException} raised
     * while loading, processing or writing is printed to stderr and ends the
     * run; it is not rethrown.
     */
    public void execute(){
        // Make sure the target directory exists so the image writes below do
        // not fail merely because of a missing folder.
        new File(outputDir).mkdirs();

        // try-with-resources closes the document even when a page fails;
        // previously the PDDocument was never closed.
        try(PDDocument document = PDDocument.load(new File(filePath))){
            for(PDPage page : document.getPages()){
                processPage(page);
            }
        }catch(IOException e){
            e.printStackTrace();
        }
    }

    /**
     * Handles the {@code Do} operator itself and delegates every other
     * operator to the superclass.
     *
     * @param operator the operator being processed
     * @param operands the operator's operands
     * @throws IOException if reading the image or writing the PNG fails
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
        if(!"Do".equals(operator.getName())){
            super.processOperator(operator, operands);
            return;
        }

        // A malformed content stream may carry a "Do" with a missing or
        // non-name operand; skip it instead of failing with an unchecked
        // IndexOutOfBounds/ClassCast exception.
        if(operands.isEmpty() || !(operands.get(0) instanceof COSName)){
            return;
        }

        COSName objectName = (COSName) operands.get(0);
        PDXObject pdxObject = getResources().getXObject(objectName);

        if(pdxObject instanceof PDImageXObject){
            PDImageXObject image = (PDImageXObject) pdxObject;
            BufferedImage bImage = image.getImage();

            // Random file name: the same image drawn twice yields two files.
            String randomName = UUID.randomUUID().toString();
            File outputFile = new File(outputDir,randomName + ".png");

            ImageIO.write(bImage, "PNG", outputFile);
        }else if(pdxObject instanceof PDFormXObject){
            // Forms can contain further images; process them recursively.
            PDFormXObject form = (PDFormXObject) pdxObject;
            showForm(form);
        }
    }
}
Demo
public class ExtractImageDemo{
public static void main(String[] args){
String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
ExtractImagesUseCase useCase = new ExtractImagesUseCase(
filePath,
outputDir
);
useCase.execute();
}
}
Try JPedal; that will work. It can extract almost any type of object (images, text, ...).
jpedal-Java developer library
PDFDecoder API from JPedal will help you extract the words.
// Decode the page so its content is available to the grouping object.
decodePdf.decodePage(page);
// Create the grouping object to apply grouping to the data
PdfGroupingAlgorithms currentGrouping = decodePdf.getGroupingObject();
// Bounding box for the whole page, taken from the page's media box.
PdfPageData currentPageData = decodePdf.getPdfPageData();
int x1 = currentPageData.getMediaBoxX(page);
int x2 = currentPageData.getMediaBoxWidth(page)+x1;
// The vertical origin must come from the Y accessor; the original read
// getMediaBoxX here, which is only right when both origins are equal.
int y2 = currentPageData.getMediaBoxY(page);
// Top edge = origin + height, mirroring x2 = width + x1 above.
// NOTE(review): assumes getMediaBoxHeight returns an extent, not an absolute
// coordinate - confirm against the JPedal version in use.
int y1 = currentPageData.getMediaBoxHeight(page)+y2;
// Extract words inside the rectangle; the last argument lists the characters
// passed to JPedal as punctuation.
List words = currentGrouping.extractTextAsWordlist(x1, y1, x2, y2, page, true, "&:=()!;.,\\/\"\"\'\'");
Now iterate through the list to get the words in the PDF. Hope it works. Thanks!
精彩评论