SSIS Script Component code for stripping the metadata from RFC822 output and keeping only the HTML document. The code reads the “description” input column; before placing it in the script, add an output column named “html” of type DT_NTEXT to the script component.
// Matches from the first "<html" (case-insensitive) through the last "</html>"
// in the text. Built once and shared by all rows, because constructing a Regex
// re-parses the pattern and doing that for every row is wasted work.
private static readonly Regex HtmlDocumentRegex =
    new Regex(@"(?i)(?=<html)[\S\s]+(?:<\/html>)");

/// <summary>
/// Processes one input row: decodes the <c>description</c> blob to text, extracts
/// the span from the first <c>&lt;html</c> to the last <c>&lt;/html&gt;</c>, and
/// appends it to the <c>html</c> output blob.
/// </summary>
/// <param name="Row">The current row of the input buffer.</param>
/// <remarks>
/// Nothing is written to <c>html</c> when <c>description</c> is null or when no
/// HTML document is found in it.
/// </remarks>
public override void Input0_ProcessInputRow(Input0Buffer Row)
{
    var blobColumn = Row.description;
    if (blobColumn.IsNull)
    {
        // No source text for this row, so there is nothing to extract.
        return;
    }

    string stringData = BlobColumnToString(blobColumn);
    Match match = HtmlDocumentRegex.Match(stringData);
    if (match.Success)
    {
        // Re-encode the matched HTML to bytes before appending to the output blob.
        Row.html.AddBlobData(GetBytes(match.Value));
    }
}
/// <summary>
/// Encodes a string as UTF-16 little-endian bytes (two bytes per char, no
/// byte-order mark).
/// </summary>
/// <param name="str">The text to encode.</param>
/// <returns>The encoded bytes; an empty array for an empty string.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="str"/> is null.</exception>
byte[] GetBytes(string str)
{
    // Encoding.Unicode is explicitly little-endian UTF-16, the same encoding
    // BlobColumnToString uses to decode. A raw memory copy of the char array
    // would instead depend on the byte order of the machine running the package.
    return System.Text.Encoding.Unicode.GetBytes(str);
}
/// <summary>
/// Reads the full contents of a blob column and decodes them as UTF-16
/// (<c>Encoding.Unicode</c>) text.
/// </summary>
/// <param name="blobColumn">The blob column to read.</param>
/// <returns>The decoded text, or an empty string when the column is null.</returns>
string BlobColumnToString(BlobColumn blobColumn)
{
    if (!blobColumn.IsNull)
    {
        // Read the whole blob in one call, from offset 0 through its full length.
        int byteCount = Convert.ToInt32(blobColumn.Length);
        byte[] rawBytes = blobColumn.GetBlobData(0, byteCount);
        return System.Text.Encoding.Unicode.GetString(rawBytes);
    }

    return string.Empty;
}
/// <summary>
/// Removes every newline and tab character from the input text.
/// </summary>
/// <param name="strIn">The text to clean.</param>
/// <returns>
/// The text with all runs of <c>\n</c> and <c>\t</c> removed. If the regex
/// replacement exceeds its 1.5-second timeout, the input is returned unchanged.
/// </returns>
static string CleanInput(string strIn)
{
    // Upper bound on how long the replacement may run.
    TimeSpan matchTimeout = TimeSpan.FromSeconds(1.5);

    string cleaned;
    try
    {
        cleaned = Regex.Replace(strIn, @"[\n\t]+", "", RegexOptions.None, matchTimeout);
    }
    catch (RegexMatchTimeoutException)
    {
        // On timeout, fall back to the original text rather than failing.
        cleaned = strIn;
    }

    return cleaned;
}
No comment yet, add your voice below!