Hello all,
I have a PDF with Arabic text (here).
Now I want to extract the text from the PDF using GroupDocs.Parser (v19.5).
The PDF is stored in my database as a byte[]. Below is my code for parsing the PDF with Arabic text and storing the extracted text in my database.
// Accumulates the text extracted from the PDF bytes; it is read later when the
// text is written to the database.
StringBuilder strTextData = new StringBuilder();

// Wrap the raw PDF bytes in a stream for the extractor. The using statement
// guarantees the stream is disposed even if extraction throws.
using (MemoryStream mTestStream = new MemoryStream(fileBytes))
{
    // ExtractTextAll may return null (no extractor available); treat that as
    // empty text so Trim() is always safe.
    strTextData.Append((ExtractTextAll(mTestStream, false) ?? "").Trim());
}
/// <summary>
/// Extracts all text from the document held in <paramref name="stream"/>
/// using GroupDocs.Parser.
/// </summary>
/// <param name="stream">Document content; it is rewound to the start before extraction.</param>
/// <param name="formatted">
/// When true a formatted text extractor is requested from the factory,
/// otherwise a plain text extractor.
/// </param>
/// <returns>
/// The text returned by the extractor, or null when the factory yields no extractor.
/// </returns>
private string ExtractTextAll(MemoryStream stream, bool formatted)
{
    // Rewind so the extractor reads the document from its first byte.
    stream.Seek(0, SeekOrigin.Begin);

    var extractorFactory = new GroupDocs.Parser.ExtractorFactory();

    GroupDocs.Parser.Extractors.Text.TextExtractor textExtractor;
    if (formatted)
    {
        textExtractor = extractorFactory.CreateFormattedTextExtractor(stream);
    }
    else
    {
        textExtractor = extractorFactory.CreateTextExtractor(stream);
    }

    // A using statement accepts a null resource and simply skips Dispose,
    // so the "no extractor" case can be handled inside it.
    using (textExtractor)
    {
        if (textExtractor == null)
        {
            return null;
        }

        return textExtractor.ExtractAll();
    }
}
// Persist the extracted text for this file version: update the page-0 row of
// FileTextData if it exists, insert it otherwise, then flag the file version
// as having its text extracted.
using (SqlConnection connection = new SqlConnection(Program.GetConnectionString()))
{
    connection.Open();

    // Single batch: UPDATE, conditional INSERT when no row was updated,
    // then the status flag UPDATE on FileVersion.
    string upsertSql = @"UPDATE [dbo].[FileTextData] SET [TextData]=@TextData, [ModifiedDateTime]=GETDATE() WHERE FileVersionId=@FileVersionId AND [PageIndex]=0
IF @@ROWCOUNT <= 0
BEGIN
INSERT INTO [dbo].[FileTextData] ([FileVersionId], [PageIndex], [TextData], [CreatedDateTime], [ModifiedDateTime])
VALUES(@FileVersionId, 0, @TextData, GETDATE(), GETDATE())
END
UPDATE [dbo].[FileVersion] SET IsTextDataExtracted='Y' WHERE Id=@FileVersionId
";

    using (SqlCommand command = new SqlCommand(upsertSql, connection))
    {
        SqlParameter versionIdParameter = command.Parameters.Add("@FileVersionId", SqlDbType.BigInt, 0);
        versionIdParameter.Value = fileVersionId;

        // NVarChar keeps the text as Unicode; size -1 requests NVARCHAR(MAX).
        SqlParameter textDataParameter = command.Parameters.Add("@TextData", SqlDbType.NVarChar, -1);
        textDataParameter.Value = strTextData.ToString();

        WriteToLog("Updating database ...");
        command.ExecuteNonQuery();
        WriteToLog("Updating database Completed...");
    }
}
But the problem is that the Arabic text is stored in my database in reverse order.
For example, the text (دائرة بلدية أبوظضبي) is converted into (يبضظوبأ ةيدلب ةرئاد), and I have to use this website whenever I want to search the content of my PDFs.
How can I fix this?
NOTE: If I download the PDF, or view it (using GroupDocs.Viewer), from the byte[] stored in my database like below:
// For downloading: send the stored bytes as a generic binary attachment,
// named after the file-name portion of the stored path.
string downloadFileName = Path.GetFileName(data.FileName);
return File(fileBytes, System.Net.Mime.MediaTypeNames.Application.Octet, downloadFileName);
the text in the PDF appears correctly.