Table of Contents
Leveraging kernel structure process data of malicious software to train Logistic Regression based model for runtime classification and determination.
The first step was implementing a driver that aggregates data from the kernel structures for two common process types: CLI and GUI processes. The drivers were written in C and designed to monitor and extract runtime kernel activity for 11 different features that we determined would be useful for training the model. The features that were extracted were:
KERNEL FEATURE | FEATURE EXPLANATION |
---|---|
map_count | Number of memory regions of a process |
hiwater_rss | Max number of page frames ever owned by the process |
hiwater_vm | Max number of pages appeared in memory region of the process |
total_vm | Size of process’s address space in terms of number of pages |
exec_vm | Number of pages in executable memory mappings of process |
utime | Tick count of a process that is executing in user mode |
stime | Tick count of a process in the kernel mode |
nvcsw | Number of voluntary context switches |
nivcsw | Number of involuntary context switches |
min_flt | Number of minor page faults |
fs.count | Usage count of the process's `fs` structure (the field was `count`; it is now called `users`) |
Here I have included a sample of the code for the driver that demonstrates how we extract the runtime kernel activity and place it into a buffer instead of writing directly to a file for greater performance.
static int extract_features(void*n)
{
//task_features tf;
// int pid_pname;
sample val;
int err = 0;
struct task_struct *task;
int iterations;
int loop;
ktime_t startTime;
s64 timeTaken_us;
int delayAmtMin, delayAmtMax;
int found;
int len =0;
fileOpen = 1;
tf.pid[0]=0; tf.cnt=0;
iterations = dura / srate;
delayAmtMin = 100 * srate -5;
delayAmtMax = 100 * srate +5;
DBG("In extract_features");
// Allow the SIGKILL signal
allow_signal(SIGKILL);
while (!kthread_should_stop()) {
if (signal_pending(current)) break;
usleep_range(delayAmtMin, delayAmtMax);
if (sampling) {
found = 0;
for_each_process(task) {
// pr_info("%s [%d]\\n", task->comm, task->pid);
// uncomment for pid matching
//kstrtoint(ProcessName, 0, &pid_pname);
//if (task->pid == pid_pname) { found = 1; break; }
// uncomment for pname matching
if (!strcmp(task->comm, ProcessName)) { found = 1; break; }
}
//ssleep(1);
if (found) {
tf.pid[0]=0; tf.cnt=0;
DBG("In extract_features found process");
DBG("Number of iteration %i", iterations);
memset(&val, 0, sizeof(val));
startTime = ktime_get();
// sample at rate indicated for the number of iterations = duration /rate
for (loop = 0; loop < iterations; loop++) {
for_each_process(task) {
// pr_info("%s [%d]\\n", task->comm, task->pid);
// uncomment for pid matching
//kstrtoint(ProcessName, 0, &pid_pname);
//if (task->pid == pid_pname) {
// uncomment for pname matching
if (!strcmp(task->comm, ProcessName)) {
// fpu_counter - > uage counter floatin point units (not available since version linux 2.13)
// Memory related features
// map_count -> number of memory regions of a process
if ((task->active_mm)) {
val.map_count = (*task->active_mm).map_count;
val.hiwater_rss = (*task->active_mm).hiwater_rss;
val.hiwater_vm = (*task->active_mm).hiwater_vm;
val.total_vm = (*task->active_mm).total_vm;
val.exec_vm = (*task->active_mm).exec_vm;
}
val.utime = task->utime;
val.stime = task->stime;
val.nvcsw = task->nvcsw;
val.nivcsw = task->nivcsw;
val.min_flt = task->min_flt;
}
}
memcpy(&tf.pid[tf.cnt], &val, sizeof(val));
tf.cnt += sizeof(val);
memset(&val, 0, sizeof(val));
timeTaken_us = ktime_us_delta(ktime_get(), startTime);
if (timeTaken_us < delayAmtMin) {
usleep_range(delayAmtMin - timeTaken_us, delayAmtMax - timeTaken_us);
}
else {
DBG("exceeded by %llu on iteration: %i", (timeTaken_us-delayAmtMin), loop);
}
startTime = ktime_get();
}
// Open file to save samples
len = strlen(path);
strcpy(fullFileName, path);
if (fullFileName[len-1] != '/') strcat(fullFileName, "/");
strcat(fullFileName, ProcessName);
strcat(fullFileName, ".dat");
printk(KERN_INFO "File to Open: %s\\n", fullFileName);
filepath = fullFileName; // set for disk write code
// open file to save samples use path when driver started and set process name as file name .dat
DBG("Initilizing Dump...");
if((err = setup())) {
DBG("Setup Error");
cleanup();
}
if (!err) err = writeFormatData();
// close file if open
cleanup(); // close sample file
fileOpen = 0;
sampling = 0;
}
}
}
DBG("Leaving extract_features");
do_exit(0);
return 0;
}
/*
 * writeFormatData - write every buffered sample as one CSV record.
 *
 * tf.pid holds tf.cnt bytes of back-to-back `sample` records (filled in by
 * extract_features). Each record is copied out and formatted as
 *
 *   map_count,hiwater_rss,hiwater_vm,total_vm,exec_vm,utime,stime,nvcsw,nivcsw,min_flt,fscount
 *
 * followed by a newline, then handed to write_buffer().
 *
 * Returns 0 on success, or the first non-zero value returned by
 * write_buffer(), in which case the remaining samples are not written.
 */
static int writeFormatData(void)
{
    int err = 0;
    int i, cnt, loop;
    sample val;
    char writeBuffer[500];

    loop = tf.cnt/sizeof(val);
    DBG("In writeFormatData");
    DBG("cnt: %i", tf.cnt);
    DBG("sample Size: %lu", sizeof(val));
    DBG("loop count: %i", loop);

    for (i = 0; i < loop; i++) {
        // Copy the i-th record out of the byte buffer before reading its fields.
        cnt = i * sizeof(val);
        memcpy(&val, &tf.pid[cnt], sizeof(val));

        // One bounded format call per record; the line ends with a real
        // newline so each sample is on its own row of the output file.
        snprintf(writeBuffer, sizeof(writeBuffer),
                 "%d,%lu,%lu,%lu,%lu,%ld,%ld,%lu,%lu,%lu,%d\n",
                 val.map_count,    // number of memory regions of the process
                 val.hiwater_rss,  // max number of page frames ever owned
                 val.hiwater_vm,   // max number of pages in the memory regions
                 val.total_vm,     // address-space size in pages
                 val.exec_vm,      // pages in executable mappings
                 val.utime,        // tick count in user mode
                 val.stime,        // tick count in kernel mode
                 val.nvcsw,        // voluntary context switches
                 val.nivcsw,       // involuntary context switches
                 val.min_flt,      // minor page faults
                 val.fscount);     // fs usage count (was count, now field called users)

        err = write_buffer(writeBuffer);
        if (err) return err;
    }
    return err;
}
Using the driver, we collected data for two common process types: CLI and GUI processes. The Command Line Interface (CLI) processes that we chose were ls, ps, and ipconfig. The Graphical User Interface (GUI) processes that we chose were gedit and firefox.
The data was then processed for use in training and developing our machine learning model using Logistic Regression.
Finally, the model was trained using the Waikato Environment for Knowledge Analysis (WEKA) using the classifier model j4.8. This logistic regression based model leveraged 10-fold cross-validation in the testing of the model.
WEKA helps data-driven organizations seamlessly and sustainably store, process, and manage data virtually anywhere with cloud simplicity and on-prem performance. Our software-defined, cloud-native data platform turns stagnant data silos into streaming data pipelines that fuel next-generation workloads like AI and HPC. - WEKA