Thursday, 9 March 2017

Convert HTML Tables To DataSet In C#

/// <summary>
/// Converts every &lt;table&gt; element found in an HTML string into a
/// <see cref="DataTable"/> and returns them together in a <see cref="DataSet"/>.
/// </summary>
/// <remarks>
/// Parsing is regex based, so nested tables are not supported: the table
/// pattern is non-greedy and stops at the first closing &lt;/table&gt; tag.
/// Cell values are the raw inner HTML of each &lt;th&gt;/&lt;td&gt; element; no
/// trimming or entity decoding is performed.
/// When a table contains &lt;th&gt; cells, their contents become the column
/// names and the first &lt;tr&gt; is treated as the header row and skipped.
/// Otherwise the columns are named "Column 1", "Column 2", ... based on the
/// number of &lt;td&gt; cells in the first row.
/// A data row that has more &lt;td&gt; cells than the table has columns makes
/// the DataRow indexer throw, and duplicate header texts make
/// DataColumnCollection.Add throw; both are left to surface to the caller.
/// </remarks>
/// <param name="HTML">The HTML markup to scan for tables.</param>
/// <returns>
/// A DataSet holding one DataTable per matched table, in document order.
/// The DataSet is empty when no table is found.
/// </returns>
private DataSet ConvertHTMLTablesToDataSet(string HTML) {
        // Declarations
        const RegexOptions Options = RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase;
        DataSet ds = new DataSet();
        string TableExpression = "<table[^>]*>(.*?)</table>";
        // The \b keeps this pattern from also matching <thead ...>, which
        // would otherwise swallow the first header cell into a bogus match.
        string HeaderExpression = @"<th\b[^>]*>(.*?)</th>";
        string RowExpression = "<tr[^>]*>(.*?)</tr>";
        string ColumnExpression = "<td[^>]*>(.*?)</td>";
        // Get a match for all the tables in the HTML
        MatchCollection Tables = Regex.Matches(HTML, TableExpression, Options);
        // Loop through each table element
        foreach (Match Table in Tables) {
            // Row counter used to recognise (and skip) the header row
            int iCurrentRow = 0;
            DataTable dt = new DataTable();
            // Get a match for all the headers and rows in the table
            MatchCollection Headers = Regex.Matches(Table.Value, HeaderExpression, Options);
            MatchCollection Rows = Regex.Matches(Table.Value, RowExpression, Options);
            // Decide on real <th> matches (case-insensitive) rather than a
            // plain substring test, which is fooled by <thead> and by <TH>.
            bool HeadersExist = Headers.Count > 0;
            // Create the relevant amount of columns for this table (use the
            // headers if they exist, otherwise use default names)
            if (HeadersExist) {
                foreach (Match Header in Headers) {
                    dt.Columns.Add(Header.Groups[1].ToString());
                }
            } else if (Rows.Count > 0) {
                // Size the table from the first row; computed once instead of
                // on every loop iteration. A table without rows gets no columns.
                int iColumnCount = Regex.Matches(Rows[0].Value, ColumnExpression, Options).Count;
                for (int iColumns = 1; iColumns <= iColumnCount; iColumns++) {
                    dt.Columns.Add("Column " + iColumns);
                }
            }
            // Loop through each row element
            foreach (Match Row in Rows) {
                // Only process the row if it isn't the header row
                if (!(iCurrentRow == 0 && HeadersExist)) {
                    // Create a new row and reset the current column counter
                    DataRow dr = dt.NewRow();
                    int iCurrentColumn = 0;
                    // Get a match for all the columns in the row
                    MatchCollection Columns = Regex.Matches(Row.Value, ColumnExpression, Options);
                    // Loop through each column element
                    foreach (Match Column in Columns) {
                        // Add the value to the DataRow
                        dr[iCurrentColumn] = Column.Groups[1].ToString();
                        // Increase the current column
                        iCurrentColumn++;
                    }
                    // Add the DataRow to the DataTable
                    dt.Rows.Add(dr);
                }
                // Increase the current row counter
                iCurrentRow++;
            }
            // Add the DataTable to the DataSet
            ds.Tables.Add(dt);
        }
        return ds;
}

No comments:

Post a Comment

Get all non-clustered indexes

DECLARE cIX CURSOR FOR     SELECT OBJECT_NAME(SI.Object_ID), SI.Object_ID, SI.Name, SI.Index_ID         FROM Sys.Indexes SI             ...