SQL2008 has new SSIS Package task DataProfiler. That is really interesting task with great capabilities. It allows to make deep analisys of data. Here is quick overview of that task.
But when I say new it also means raw, bugged, incomplete, incosistent. It is hard to argue with Microsoft Connect support team. By design - is a tipical answer in many scenarios. I am not making complaints, just want to share with you my experience which you will never collect reading Microsoft Connect feedbacks.
So, the structure of the task is based on requests which you put inside. Currently, there are following requests:
- Candidate Key Profile
- Column length distribution
- Null Ration Profile
- Column Pattern Profile
- Column Statistics Profile
- Column Value Distribution Profile
- Functional Dependency Profile
The main question is how to use this tool to iterate through database tables and get profile fr each table? Yes, we have to use script task as it was advised by msdn here.
But when you run this code in flow with many big tables (in my case more than 200,000 rows) you may experience with memory leak exception. The other issue was “Primary Key Kandidate” request – not all fieald are processed by this request qccuratly. Sometimes it generates exception:
Error occurred when profiling the data. The error message is: System.ArgumentOutOfRangeException: Specified argument was out of the range of valid values…
The profile request “dd8232d8-1e6d-495a-8ca7-e724205ab449″ is invalid. Correct it or remove it to proceed. The detailed error is: Profile type of “CandidateKeyProfileRequest” is not applicable to column “TonnesRequired” of type “Float”.
That means some Datatypes are not allowed in this request and you have to rewright you script like this
/// Candidate Key Profile = 1
/// Column length distribution = 2
/// Null Ration Profile = 4
/// Column Pattern Profile = 8 //sometimes causes memory leaks for big tables
/// Column Statistics Profile = 16
/// Column Value Distribution Profile = 32
/// Functional Dependency Profile = 64 --Not implemented yet
int QueryMask = 1 + 2 + 4 + 8 + 16 + 32;
//if (obj.RowCount > ThresholdNumberOfRows) QueryMask = QueryMask - 8;
StringCollection ColumnNames = new StringCollection();
foreach (Column column in obj.Columns)
{
if ((column.DataType.SqlDataType != SqlDataType.Float)
&& (column.DataType.SqlDataType != SqlDataType.Decimal)
&& (column.DataType.SqlDataType != SqlDataType.Numeric)
&& (column.DataType.SqlDataType != SqlDataType.Money)
&& (column.DataType.SqlDataType != SqlDataType.SmallMoney)
&& (column.DataType.SqlDataType != SqlDataType.Real)
&& (column.DataType.SqlDataType != SqlDataType.UniqueIdentifier)
&& (column.DataType.SqlDataType != SqlDataType.Binary)
&& (column.DataType.SqlDataType != SqlDataType.Xml)
&& (column.DataType.SqlDataType != SqlDataType.Timestamp)
&& (column.DataType.SqlDataType != SqlDataType.Geography)
&& (column.DataType.SqlDataType != SqlDataType.Geometry)
&& (column.DataType.SqlDataType != SqlDataType.Image)
)
{
ColumnNames.Add(column.Name);
}
}
string outputXml = DataProfilerWrapper.Profiler.GetDataProfile(DBConnectionString, obj.Schema, obj.Name, ColumnNames, QueryMask);
Now we need to implement GetDataProfile method:
///
/// QueryMask is a bitmaks
///
public static string GetDataProfile(string DBConnectionString, string SchemaName, string TableName, StringCollection ColumnNames, int QueryMask)
{
string outputXml = null;
try
{
// initialize SSIS application runtime
var app = new Microsoft.SqlServer.Dts.Runtime.Application();
// load the package
var pkg = app.LoadPackage(@”DataProfilerWrapperPackageTemplate.dtsx”, null);
// the package contains only one task. loading it.
var task = pkg.Executables[0] as TaskHost;
var dpt = task.InnerObject as DataProfilingTask;
// get connections
var connectionDB = pkg.Connections[0];
var connectionTagDB = pkg.Connections[1];
// connection string for SQL Server. assuming that we’re using localhost.
pkg.Variables["DBConnectionString"].Value = DBConnectionString;
// connection string for TagDB (see ColumnPatternProfileRequest)
pkg.Variables["TagDBConnectionString"].Value = “”;
#region create new Candidate Key Profile query
if ((QueryMask & 1 ) == 1 )
{
var keyprofileRequest = new Microsoft.DataDebugger.DataProfiling.CandidateKeyProfileRequest();
// set the id for request
keyprofileRequest.RequestID = Guid.NewGuid().ToString();
// set the database id
keyprofileRequest.DataSourceID = connectionDB.ID;
// select table
keyprofileRequest.Table = new Microsoft.DataDebugger.DataProfiling.TableQName(SchemaName, TableName);
// add column(s)
foreach (string columnName in ColumnNames)
{
keyprofileRequest.KeyColumns.Add(new Microsoft.DataDebugger.DataProfiling.ColumnParameter(columnName));
}
// set profiling attributes
keyprofileRequest.KeyStrengthThreshold = 0.95;
keyprofileRequest.MaxNumberOfViolations = 100;
keyprofileRequest.ThresholdSetting = Microsoft.DataDebugger.DataProfiling.StrengthThresholdSetting.Specified;
// add the request to request collection so it will be executed.
dpt.ProfileRequests.Add(keyprofileRequest);
}
#endregion
#region create new Column length distribution query
if ((QueryMask & 2 ) == 2 )
{
var collendistribution = new Microsoft.DataDebugger.DataProfiling.ColumnLengthDistributionProfileRequest();
// set the id for request
collendistribution.RequestID = Guid.NewGuid().ToString();
// set the database id
collendistribution.DataSourceID = connectionDB.ID;
// select table
collendistribution.Table = new Microsoft.DataDebugger.DataProfiling.TableQName(SchemaName, TableName);
// add column(s)
collendistribution.Column = Microsoft.DataDebugger.DataProfiling.ColumnParameter.WildCard;
// or use a single column with
// keyprofileRequest.KeyColumns.Add(new Microsoft.DataDebugger.DataProfiling.ColumnParameter(“CurrencyAlternateKey”));
// set profiling attributes
collendistribution.IgnoreLeadingSpace = true;
collendistribution.IgnoreTrailingSpace = true;
// add the request to request collection so it will be executed.
dpt.ProfileRequests.Add(collendistribution);
}
#endregion
#region create new Null Ration Profile query
if ((QueryMask & 4 ) == 4 )
{
var nullratioprofileRequest = new Microsoft.DataDebugger.DataProfiling.ColumnNullRatioProfileRequest();
// set the id for request
nullratioprofileRequest.RequestID = Guid.NewGuid().ToString();
// set the database id
nullratioprofileRequest.DataSourceID = connectionDB.ID;
// select table
nullratioprofileRequest.Table = new Microsoft.DataDebugger.DataProfiling.TableQName(SchemaName, TableName);
// add column(s)
nullratioprofileRequest.Column = Microsoft.DataDebugger.DataProfiling.ColumnParameter.WildCard;
// add the request to request collection so it will be executed.
dpt.ProfileRequests.Add(nullratioprofileRequest);
}
#endregion
#region create new Column Pattern Profile query
if ((QueryMask & 8 ) == 8 )
{
var patternprofileRequest = new Microsoft.DataDebugger.DataProfiling.ColumnPatternProfileRequest();
// set the id for request
patternprofileRequest.RequestID = Guid.NewGuid().ToString();
// set the database id
patternprofileRequest.DataSourceID = connectionDB.ID;
// select table
patternprofileRequest.Table = new Microsoft.DataDebugger.DataProfiling.TableQName(SchemaName, TableName);
// add column(s)
patternprofileRequest.Column = Microsoft.DataDebugger.DataProfiling.ColumnParameter.WildCard;
patternprofileRequest.CaseSensitive = false;
//patternprofileRequest.Delimiters = “\t\r\n”;
//patternprofileRequest.Symbols = “,.;:-”‘`~=&/\\@!?()[]{}|#*^%”;
// add the request to request collection so it will be executed.
dpt.ProfileRequests.Add(patternprofileRequest);
}
#endregion
#region create new Column Statistics Profile query
if ((QueryMask & 16 ) == 16 )
{
var columnStatisticsProfileRequest = new Microsoft.DataDebugger.DataProfiling.ColumnStatisticsProfileRequest();
// set the id for request
columnStatisticsProfileRequest.RequestID = Guid.NewGuid().ToString();
// set the database id
columnStatisticsProfileRequest.DataSourceID = connectionDB.ID;
// select table
columnStatisticsProfileRequest.Table = new Microsoft.DataDebugger.DataProfiling.TableQName(SchemaName, TableName);
// add column(s)
columnStatisticsProfileRequest.Column = Microsoft.DataDebugger.DataProfiling.ColumnParameter.WildCard;
//columnStatisticsProfileRequest.AllowSqlDbType(System.Data.SqlDbType.Int);
// add the request to request collection so it will be executed.
dpt.ProfileRequests.Add(columnStatisticsProfileRequest);
}
#endregion
#region create new Column Value Distribution Profile query
if ((QueryMask & 32 ) == 32 )
{
var columnValueDistributionProfileRequest = new Microsoft.DataDebugger.DataProfiling.ColumnValueDistributionProfileRequest();
// set the id for request
columnValueDistributionProfileRequest.RequestID = Guid.NewGuid().ToString();
// set the database id
columnValueDistributionProfileRequest.DataSourceID = connectionDB.ID;
// select table
columnValueDistributionProfileRequest.Table = new Microsoft.DataDebugger.DataProfiling.TableQName(SchemaName, TableName);
// add column(s)
columnValueDistributionProfileRequest.Column = Microsoft.DataDebugger.DataProfiling.ColumnParameter.WildCard;
//columnValueDistributionProfileRequest.AllowSqlDbType(System.Data.SqlDbType.Int);
//columnValueDistributionProfileRequest.FrequentValueThreshold = 0.001;
//columnValueDistributionProfileRequest.Option = Microsoft.DataDebugger.DataProfiling.ValueDistributionOptions.FrequentValues;
// add the request to request collection so it will be executed.
dpt.ProfileRequests.Add(columnValueDistributionProfileRequest);
}
#endregion
// execute package
var result = pkg.Execute();
if (result == DTSExecResult.Success)
{
outputXml = pkg.Variables["OutputXml"].Value.ToString();
}
else
{
//System.Console.WriteLine(“Package failed. Errors:”);
foreach (var error in pkg.Errors)
{
//System.Console.WriteLine(“{0}: {1}”, error.ErrorCode, error.Description);
}
}
}
catch (Exception ex)
{
//System.Console.WriteLine(“Error: {0}”, ex.Message);
}
finally
{
GC.Collect();
}
return outputXml;
}