EVO PDF Logo

PDF to Text Component

EVO PDF Client for .NET Core

The EVO PDF client library allows you to convert PDF documents to text documents in just a few lines of code. The PDF to Text Converter object, of type PdfToTextConverter, can be initialized either with the TCP/IP address of the server or with the HTTP URL of the server, depending on the type of EVO PDF Server you have installed.

PDF to Text Converter Options

The PDF to Text Converter allows you to select the page range to convert and to specify whether the resulting text document preserves the original text layout of the PDF or is optimized for reading. You can also mark the page breaks with a special character in the resulting text document. These features of the PDF to Text Converter are demonstrated in the code sample below. The full Visual Studio demo project for ASP.NET Core is available in the product package, which you can download from the website.

Code Sample - Convert PDF to Text in ASP.NET with PdfToTextConverter Class

C#
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc;

using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Hosting;
using System.Text;

using EvoPdfClient;

namespace PdfToTextDemo.Controllers
{
    /// <summary>
    /// MVC controller of the PDF to Text demo. It serves the demo form and converts
    /// a PDF file to a downloadable text file through a <see cref="PdfToTextConverter"/>.
    /// </summary>
    public class Getting_StartedController : Controller
    {
        private readonly IWebHostEnvironment m_hostingEnvironment;

        /// <summary>
        /// Creates the controller.
        /// </summary>
        /// <param name="hostingEnvironment">Hosting environment used to locate the demo PDF file.</param>
        public Getting_StartedController(IWebHostEnvironment hostingEnvironment)
        {
            m_hostingEnvironment = hostingEnvironment;
        }

        /// <summary>
        /// Shows the demo page and publishes the path of the demo PDF file to the view
        /// through <c>ViewData["DemoFilePath"]</c>.
        /// </summary>
        /// <returns>The default view of this action.</returns>
        public IActionResult Index()
        {
            ViewData["DemoFilePath"] = m_hostingEnvironment.ContentRootPath + "/wwwroot" + "/DemoFiles/Input/Demo.pdf";

            return View();
        }

        /// <summary>
        /// Extracts the text of the PDF file named in the posted form and returns it as a
        /// text file download named after the PDF file, with a ".txt" extension.
        /// </summary>
        /// <param name="collection">
        /// The posted form. Fields read: textBoxServerIP, textBoxServerPort, textBoxServicePassword,
        /// ServerType, textBoxWebServiceUrl, filePathTextBox, startPageTextBox, endPageTextBox,
        /// textLayoutDropDownList, textEncodingDropDownList and markPageBreaksCheckBox.
        /// </param>
        /// <returns>
        /// A file result holding the extracted text encoded with the selected text encoding
        /// (preceded by that encoding's byte order mark, when it defines one).
        /// </returns>
        /// <exception cref="Exception">
        /// No PDF file path was supplied, or reading/converting the file failed (the original
        /// failure is available as <see cref="Exception.InnerException"/>).
        /// </exception>
        /// <exception cref="FormatException">The server port or a page number is not a valid number.</exception>
        /// <exception cref="OverflowException">The server port or a page number is out of range.</exception>
        [HttpPost]
        public ActionResult ConvertPdfToText(IFormCollection collection)
        {
            // Get the server options.
            // A form field that was not posted converts to a null string, so the password
            // must be tested with IsNullOrEmpty instead of dereferencing it directly.
            string serverIP = collection["textBoxServerIP"];
            string servicePassword = collection["textBoxServicePassword"];
            bool useServicePassword = !String.IsNullOrEmpty(servicePassword);
            bool useTcpService = collection["ServerType"] == "radioButtonUseTcpService";
            string webServiceUrl = collection["textBoxWebServiceUrl"];

            // the pdf file to convert
            // NOTE(review): this path comes straight from the request and is read from the server
            // file system below; restrict it to a known folder before exposing this action publicly.
            string pdfFilePath = GetTrimmedField(collection, "filePathTextBox");
            if (pdfFilePath.Length == 0)
                throw new Exception("Please choose a PDF file to convert");

            // start page number
            int startPageNumber = int.Parse(GetTrimmedField(collection, "startPageTextBox"));

            // end page number
            // when it is 0 the extraction will continue up to the end of document
            int endPageNumber = 0;
            string endPageText = GetTrimmedField(collection, "endPageTextBox");
            if (endPageText.Length > 0)
                endPageNumber = int.Parse(endPageText);

            // the output text layout
            TextLayout textLayout = SelectedTextLayout(collection["textLayoutDropDownList"]);

            // the output text encoding
            Encoding textEncoding = SelectedTextEncoding(collection["textEncodingDropDownList"]);

            // page breaks
            bool markPageBreaks = collection["markPageBreaksCheckBox"].Count > 0;

            string outputFileName = System.IO.Path.GetFileNameWithoutExtension(pdfFilePath) + ".txt";

            // Create the PDF to Text converter object
            PdfToTextConverter pdfToTextConverter;
            if (useTcpService)
            {
                // The port is needed only by the TCP service, so it is parsed only here;
                // a blank port field must not break conversions made through the HTTP service.
                uint serverPort = uint.Parse(GetTrimmedField(collection, "textBoxServerPort"));
                pdfToTextConverter = new PdfToTextConverter(serverIP, serverPort);
            }
            else
            {
                pdfToTextConverter = new PdfToTextConverter(true, webServiceUrl);
            }

            // Set optional service password
            if (useServicePassword)
                pdfToTextConverter.ServicePassword = servicePassword;

            // NOTE(review): the license key is embedded in source; consider loading it from configuration.
            pdfToTextConverter.LicenseKey = "ujQlNSAgNSU1IzslNSYkOyQnOywsLCw1JQ==";

            pdfToTextConverter.Layout = textLayout;
            pdfToTextConverter.MarkPageBreaks = markPageBreaks;

            string extractedText;
            try
            {
                // read the PDF file in a memory buffer
                byte[] sourcePdfBytes = System.IO.File.ReadAllBytes(pdfFilePath);

                // extract text from PDF
                extractedText = pdfToTextConverter.ConvertToText(sourcePdfBytes, startPageNumber, endPageNumber);
            }
            catch (Exception ex)
            {
                // Keep the original exception as InnerException so its type and stack trace are not lost.
                throw new Exception(String.Format("An error occurred. {0}", ex.Message), ex);
            }

            // Encode the text with the encoding selected in the form. The preamble is the byte
            // order mark of the encoding; it is empty for encodings that do not define one.
            byte[] markBytes = textEncoding.GetPreamble();
            byte[] textBytes = textEncoding.GetBytes(extractedText);

            // copy the marker and text bytes in output buffer
            byte[] bytes = new byte[markBytes.Length + textBytes.Length];
            Array.Copy(markBytes, 0, bytes, 0, markBytes.Length);
            Array.Copy(textBytes, 0, bytes, markBytes.Length, textBytes.Length);

            FileResult fileResult = new FileContentResult(bytes, "text/plain; charset=" + textEncoding.WebName);
            fileResult.FileDownloadName = outputFileName;

            return fileResult;
        }

        /// <summary>
        /// Reads a form field as a trimmed string.
        /// </summary>
        /// <param name="collection">The posted form.</param>
        /// <param name="key">The name of the form field.</param>
        /// <returns>The trimmed field value, or an empty string when the field was not posted.</returns>
        private static string GetTrimmedField(IFormCollection collection, string key)
        {
            // ToString() yields an empty string for a missing field, where indexing [0] would throw.
            return collection[key].ToString().Trim();
        }

        /// <summary>
        /// Maps the text layout name selected in the form to a <see cref="TextLayout"/> value.
        /// </summary>
        /// <param name="textLayout">The selected layout name.</param>
        /// <returns>The matching layout; <see cref="TextLayout.OriginalLayout"/> for any other name.</returns>
        private TextLayout SelectedTextLayout(string textLayout)
        {
            return textLayout switch
            {
                "Original" => TextLayout.OriginalLayout,
                "Reading" => TextLayout.ReadingLayout,
                "Table Mode" => TextLayout.TableModeLayout,
                "Internal Order" => TextLayout.PdfInternalOrderLayout,
                _ => TextLayout.OriginalLayout,
            };
        }

        /// <summary>
        /// Maps the text encoding name selected in the form to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="textEncoding">The selected encoding name.</param>
        /// <returns>The matching encoding; <see cref="Encoding.UTF8"/> for any other name.</returns>
        private Encoding SelectedTextEncoding(string textEncoding)
        {
            return textEncoding switch
            {
                "UTF-8" => Encoding.UTF8,
                "ISO-8859-1" => Encoding.GetEncoding("iso-8859-1"),
                "ASCII" => Encoding.ASCII,
                _ => Encoding.UTF8,
            };
        }
    }
}