SSIS Script Component code for stripping metadata from RFC822 output HTML

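The SSIS Script Component code below strips the metadata from RFC822 output, keeping only the HTML document (everything from the opening <html tag through the closing </html> tag) found in the “description” input column. Before pasting it into the script, add an output column named “html” of type DT_NTEXT to the Script Component.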

/// <summary>
/// Matches, case-insensitively, from the first "&lt;html" in the input through the
/// last "&lt;/html&gt;" (the quantifier is greedy). Built once and reused, because
/// the row handler below runs for every input row and constructing a Regex per row
/// repeats the pattern parsing each time.
/// </summary>
private static readonly Regex HtmlDocumentRegex =
    new Regex(@"(?i)(?=<html)[\S\s]+(?:<\/html>)", RegexOptions.Compiled);

/// <summary>
/// Extracts the HTML document embedded in the row's <c>description</c> blob and
/// appends it to the row's <c>html</c> output column.
/// </summary>
/// <param name="Row">The input row currently being processed.</param>
/// <remarks>
/// Rows whose <c>description</c> is null, or that contain no
/// &lt;html ...&gt;...&lt;/html&gt; span, leave the <c>html</c> column untouched.
/// </remarks>
public override void Input0_ProcessInputRow(Input0Buffer Row)
    {
        var blobColumn = Row.description;
        if (blobColumn.IsNull)
        {
            // Nothing to extract from an empty description.
            return;
        }

        string stringData = BlobColumnToString(blobColumn);
        Match match = HtmlDocumentRegex.Match(stringData);
        if (match.Success)
        {
            // GetBytes emits two bytes per char; presumably this is the layout the
            // DT_NTEXT output column expects - confirm against the column definition.
            Row.html.AddBlobData(GetBytes(match.Value));
        }
    }

    /// <summary>
    /// Copies the raw UTF-16 code units of a string into a new byte array.
    /// </summary>
    /// <param name="str">The text to convert.</param>
    /// <returns>
    /// A byte array of length <c>str.Length * sizeof(char)</c> holding the chars'
    /// in-memory bytes (native byte order, no byte-order mark).
    /// </returns>
    byte[] GetBytes(string str)
    {
        char[] codeUnits = str.ToCharArray();
        var raw = new byte[codeUnits.Length * sizeof(char)];

        // A straight memory copy: no encoder is involved, so every code unit is
        // carried over exactly as stored.
        System.Buffer.BlockCopy(codeUnits, 0, raw, 0, raw.Length);
        return raw;
    }

    /// <summary>
    /// Reads the entire contents of a blob column and decodes them as UTF-16
    /// (<see cref="System.Text.Encoding.Unicode"/>) text.
    /// </summary>
    /// <param name="blobColumn">The blob column to read.</param>
    /// <returns>The decoded text, or an empty string when the column is null.</returns>
    string BlobColumnToString(BlobColumn blobColumn)
    {
        if (!blobColumn.IsNull)
        {
            // Convert.ToInt32 throws rather than truncating if the blob length
            // does not fit in an int.
            int byteCount = Convert.ToInt32(blobColumn.Length);
            byte[] rawBytes = blobColumn.GetBlobData(0, byteCount);
            return System.Text.Encoding.Unicode.GetString(rawBytes);
        }

        return string.Empty;
    }
    /// <summary>
    /// Removes every run of newline (<c>\n</c>) and tab (<c>\t</c>) characters
    /// from the input.
    /// </summary>
    /// <param name="strIn">The text to clean.</param>
    /// <returns>
    /// The text with newlines and tabs removed, or the unmodified input if the
    /// replacement does not finish within 1.5 seconds.
    /// </returns>
    static string CleanInput(string strIn)
    {
        string cleaned;
        try
        {
            TimeSpan matchTimeout = TimeSpan.FromSeconds(1.5);
            cleaned = Regex.Replace(strIn, @"[\n\t]+", "", RegexOptions.None, matchTimeout);
        }
        catch (RegexMatchTimeoutException)
        {
            // On timeout, fall back to the original, uncleaned text.
            cleaned = strIn;
        }

        return cleaned;
    }
Filed under: Blog

No comments yet, add your voice below!


Add a Comment

Your email address will not be published.

Comment *
Name *
Email *
Website

This site uses Akismet to reduce spam. Learn how your comment data is processed.