Feb. 19th, 2013
дас ист фантастиш
Feb. 19th, 2013 10:34 am
try
{
if (header.startsWith( PDF_HEADER ))
{
float pdfVersion = Float. parseFloat(
header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER .length()+3) ) );
document.setVersion( pdfVersion );
}
else
{
float pdfVersion = Float. parseFloat(
header.substring( FDF_HEADER.length(), Math.min( header.length(), FDF_HEADER.length()+3) ) );
document.setVersion( pdfVersion );
}
}
catch ( NumberFormatException e )
{
throw new IOException( "Error getting pdf version:" + e );
}
(from pdfbox library)
а вот и мой кот
Feb. 19th, 2013 11:57 pm
/**
 * Extracts text from images embedded in a PDF by shelling out to external tools:
 * `convert` renders the PDF to a monochrome PNG, `convert` turns that PNG into a TIFF,
 * and `tesseract` is run on the TIFF. The resulting text file is read and returned.
 *
 * Only the first page is processed: if the plain `<name>.png` does not exist after
 * rendering, `<name>-0.png` is used instead.
 *
 * Intermediate files are obtained from `tempFile`, which is brought in by `import OS._`
 * (NOTE(review): `tempFile` is not visible here; it is assumed to return a `java.io.File`
 * for the given base name and extension - confirm).
 *
 * @param pdf the PDF file to process
 * @return the full content of the text file expected at `tempFile(name, "txt")`
 * @throws IllegalStateException if an expected intermediate or output file is missing
 *                               after the corresponding command has finished
 */
def extractTextFromImagesHiddenInPdf(pdf: File): String = {
  import OS._

  // Base name without extension; a name with no '.' is used as-is
  // (substring(0, -1) would throw otherwise).
  val fName = {
    val n = pdf.getName
    val dot = n lastIndexOf '.'
    if (dot < 0) n else n.substring(0, dot)
  }

  def withExtension(x: String) = tempFile(fName, x)

  val png = withExtension("png")
  val tif = withExtension("tif")
  val noname = withExtension("")
  val txt = withExtension("txt")

  // Runs the command and blocks until it exits, so the existence checks below see its
  // output. Arguments are passed as a list (no whitespace splitting), so paths that
  // contain spaces survive. The exit code is deliberately ignored: success is judged
  // by whether the expected file exists, as before.
  def exec(cmd: String*): Unit = {
    scala.sys.process.Process(cmd).!
    ()
  }

  def oops(msg: String, args: AnyRef*): Nothing =
    throw new IllegalStateException(msg.format(args: _*))

  exec("convert", "-density", "600", pdf.toString, "-monochrome", png.toString)
  val firstPage = if (png.exists) png else tempFile(fName + "-0", "png")
  if (!firstPage.exists) oops("Failed to create %s or %s", png, firstPage)

  exec("convert", firstPage.toString, tif.toString)
  if (!tif.exists) oops("Failed to create %s from %s", tif, firstPage)

  // tesseract is given the extension-less name as its output base; the result is
  // expected to show up as `txt`.
  exec("tesseract", tif.toString, noname.toString)
  if (!txt.exists) oops("Failed to create %s from %s", txt, tif)

  // Close the Source once the content has been read to avoid leaking the file handle.
  val source = Source.fromFile(txt)
  try source.mkString finally source.close()
}
(и в продакшен)
