Hello,
We’ve had fairly consistent issues when converting .eml files to .pdf: text gets cut off on the right-hand side of the page. In the email below, for example, the signature is cut off on the right.
LongSignature.zip (7.0 KB)
We see this in quite a few places in the body; tables especially tend to cause it.
We’re using the most recent version of GroupDocs.Conversion and would like an easy way to force word wrapping or something similar.
@JacobBodmer
Thank you for attaching the sample file; it will help us reproduce the issue faster. While we work on the issue, the following workaround can be implemented: a conversion in three steps:
- Convert EML to HTML
- Tweak HTML to wrap words
- Convert HTML to PDF
The sample app is attached as workaround.zip (65.6 KB). The code, which also fixes the image position in the final PDF, is shown below:
using System;
using System.IO;
using System.Reflection;
using GroupDocs.Conversion;
using GroupDocs.Conversion.Options.Convert;
using GroupDocs.Conversion.Options.Load;
using GroupDocs.Conversion.Options;
namespace SampleApp
{
static class Program
{
/// <summary>
/// Entry point: converts "LongSignature.eml" to "LongSignature.pdf".
/// </summary>
static void Main()
{
    const string sourceEml = "LongSignature.eml";
    const string targetPdf = "LongSignature.pdf";

    ConvertEmlToPdf(sourceEml, targetPdf);
}
/// <summary>
/// Converts an .eml file to PDF in three steps: EML to HTML (in memory),
/// HTML clean-up via <c>TweakHtml</c>, then HTML to PDF.
/// </summary>
/// <param name="src">Path of the source .eml file.</param>
/// <param name="dst">Path of the PDF file to write.</param>
/// <exception cref="ArgumentException">
/// <paramref name="src"/> or <paramref name="dst"/> is null or empty.
/// </exception>
/// <exception cref="InvalidOperationException">
/// The first conversion step never delivered any HTML.
/// </exception>
private static void ConvertEmlToPdf(string src, string dst)
{
    if (string.IsNullOrEmpty(src))
    {
        throw new ArgumentException("Source path must not be null or empty.", nameof(src));
    }

    if (string.IsNullOrEmpty(dst))
    {
        throw new ArgumentException("Destination path must not be null or empty.", nameof(dst));
    }

    // Step 1: Convert .eml to .html in memory
    byte[] rawHtmlBytes = null;
    using (Converter converter = new Converter(src))
    {
        converter.Convert(new WebConvertOptions(), (ConvertedContext ctx) =>
        {
            // Copy the converted stream so the bytes remain available after the callback returns.
            using (var ms = new MemoryStream())
            {
                ctx.ConvertedStream.CopyTo(ms);
                rawHtmlBytes = ms.ToArray();
            }
        });
    }

    // The buffer is only assigned inside the callback; if the callback never ran,
    // fail here with a clear message instead of a null-argument error further down.
    if (rawHtmlBytes == null)
    {
        throw new InvalidOperationException(
            "Converting '" + src + "' to HTML produced no output.");
    }

    // Step 2: Tweak HTML to fix word wrapping and layout
    byte[] htmlBytes = TweakHtml(rawHtmlBytes);

    // Step 3: Convert tweaked .html to .pdf
    // The stream factory returns a fresh MemoryStream over the same bytes on every call.
    using (Converter converter = new Converter(
        () => new MemoryStream(htmlBytes),
        (LoadContext ctx) => new WebLoadOptions()))
    {
        converter.Convert(dst, new PdfConvertOptions());
    }
}
/// <summary>
/// Applies word-wrap fixes and layout corrections to the intermediate HTML.
/// </summary>
/// <param name="rawHtml">The HTML document bytes; they are decoded as UTF-8.</param>
/// <returns>The adjusted HTML document, encoded as UTF-8.</returns>
/// <exception cref="ArgumentNullException"><paramref name="rawHtml"/> is null.</exception>
private static byte[] TweakHtml(byte[] rawHtml)
{
    if (rawHtml == null)
    {
        throw new ArgumentNullException(nameof(rawHtml));
    }

    string html = System.Text.Encoding.UTF8.GetString(rawHtml);

    // Non-breaking spaces forbid a line break at that position, so turn them into
    // ordinary spaces. The character is written as an escape so it cannot be confused
    // with (or silently turned into) a plain space, and the entity form is covered too.
    html = html.Replace("\u00A0", " ").Replace("&nbsp;", " ");

    html = FixWhiteSpaceForLongContent(html);
    html = FlattenImageCells(html);

    string css = "<style>img { display: inline !important; vertical-align: middle !important; }</style>";

    // Find the opening <head> tag in a single pass. The optional group accepts attributes
    // (<head lang="en">) while the required whitespace keeps <header> from matching.
    System.Text.RegularExpressions.Match headTag = System.Text.RegularExpressions.Regex.Match(
        html,
        @"<head(?:\s[^>]*)?>",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase
            | System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    if (headTag.Success)
    {
        // Insert the style sheet as the first child of <head>.
        html = html.Insert(headTag.Index + headTag.Length, "\n" + css);
    }
    else
    {
        // No <head> element: put the style sheet in front of the document.
        html = css + "\n" + html;
    }

    return System.Text.Encoding.UTF8.GetBytes(html);
}
/// <summary>
/// Replaces <c>white-space:nowrap</c> with <c>white-space:normal</c> on elements whose
/// text is longer than 200 characters, so that long content is allowed to wrap.
/// </summary>
/// <param name="html">The HTML markup to scan. May be null or empty.</param>
/// <returns>
/// The markup with the qualifying declarations rewritten; null or empty input is
/// returned unchanged.
/// </returns>
private static string FixWhiteSpaceForLongContent(string html)
{
    const string marker = "white-space:nowrap";
    const string replacement = "white-space:normal";
    const int textThreshold = 200;

    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // Characters that end a tag name: whitespace, end of tag, or a self-closing slash.
    char[] tagNameTerminators = { ' ', '\t', '\r', '\n', '>', '/' };

    int pos = 0;
    while ((pos = html.IndexOf(marker, pos, StringComparison.Ordinal)) >= 0)
    {
        // marker and replacement have the same length, so this offset stays valid
        // whether or not the declaration is rewritten below.
        int next = pos + marker.Length;

        // Locate the tag that carries the declaration.
        int tagStart = html.LastIndexOf('<', pos);
        int tagNameEnd = tagStart < 0
            ? -1
            : html.IndexOfAny(tagNameTerminators, tagStart + 1);

        // Skip occurrences with no preceding '<', no tag-name terminator, or an empty
        // tag name (for example when the nearest '<' opens a closing tag).
        if (tagNameEnd <= tagStart + 1)
        {
            pos = next;
            continue;
        }

        string tagName = html.Substring(tagStart + 1, tagNameEnd - tagStart - 1);
        int closePos = IndexOfClosingTag(html, tagName, pos);
        if (closePos >= 0)
        {
            // The measured span starts at the marker, so the remainder of the opening
            // tag is counted together with the element text; only complete tags are stripped.
            string content = html.Substring(pos, closePos - pos);
            string plainText = System.Text.RegularExpressions.Regex.Replace(content, "<[^>]+>", "");
            if (plainText.Length > textThreshold)
            {
                html = html.Remove(pos, marker.Length).Insert(pos, replacement);
            }
        }

        pos = next;
    }

    return html;
}

/// <summary>
/// Finds the first closing tag named <paramref name="tagName"/> at or after
/// <paramref name="startIndex"/>, ignoring closing tags that merely start with the
/// same letters (searching for "a" does not stop at "&lt;/abbr&gt;").
/// </summary>
/// <returns>The index of the "&lt;/" of the closing tag, or -1 when there is none.</returns>
private static int IndexOfClosingTag(string html, string tagName, int startIndex)
{
    string closeTag = "</" + tagName;
    int searchFrom = startIndex;

    while (true)
    {
        int index = html.IndexOf(closeTag, searchFrom, StringComparison.OrdinalIgnoreCase);
        if (index < 0)
        {
            return -1;
        }

        int after = index + closeTag.Length;
        if (after >= html.Length || html[after] == '>' || char.IsWhiteSpace(html[after]))
        {
            return index;
        }

        // Prefix of a longer tag name; keep looking behind it.
        searchFrom = after;
    }
}
/// <summary>
/// Collapses a table cell that only wraps a nested one-cell table around a linked image:
/// the linked image is moved to the end of the preceding cell and the wrapper markup is dropped.
/// </summary>
/// <param name="html">The HTML markup to process.</param>
/// <returns>The markup with every matching cell pair merged.</returns>
private static string FlattenImageCells(string html)
{
    var wrappedImageCell = new System.Text.RegularExpressions.Regex(
        string.Concat(
            // end of the previous cell followed by a cell whose attributes contain font-size:0
            @"</td><td[^>]*font-size\s*:\s*0[^>]*>",
            // nested single-cell table
            @"<table[^>]*><tr[^>]*><td[^>]*>",
            // group 1: the <a> element wrapping the <img>
            @"(<a\s[^>]*><img\s[^>]*(?:/>|>(?:</img>)?)</a>)",
            // closing tags of the nested table and of the outer cell
            @"</td></tr></table></td>"),
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    return wrappedImageCell.Replace(html, found =>
    {
        string linkedImage = found.Groups[1].Value;
        return " " + linkedImage + "</td>";
    });
}
}
}
Hope it helps