Technical Projects

Table of Contents

Malware Detection - Kernel based Machine Learning

Leveraging kernel-structure process data from malicious software to train a Logistic Regression-based model for runtime classification and determination.

The first step was implementing a driver that would aggregate data from the kernel structures relevant to two common process types: CLI and GUI processes. The drivers were written in C and designed to monitor and extract kernel runtime activity for 11 different features that we determined would be useful for training the model. The features that were extracted were:

KERNEL FEATURE	FEATURE EXPLANATION
map_count	Number of memory regions of a process
hiwater_rss	Max number of page frames ever owned by the process
hiwater_vm	Max number of pages appeared in memory region of the process
total_vm	Size of process’s address space in terms of number of pages
exec_vm	Number of pages in executable memory mappings of process
utime	Tick count of a process that is executing in user mode
stime	Tick count of a process in the kernel mode
nvcsw	Number of voluntary context switches
nivcsw	Number of involuntary context switches
min_flt	Contains the minor page faults
fs.count	number of file usage

Here I have included a sample of the code for the driver that demonstrates how we extract the runtime kernel activity and place it into a buffer instead of writing directly to a file for greater performance.

static int extract_features(void*n)
{
   //task_features tf;
//   int pid_pname;
   sample val;
   int err = 0;
   struct task_struct *task; 
   int iterations;
   int loop;
   ktime_t startTime; 
   s64 timeTaken_us;
   int delayAmtMin, delayAmtMax;  
   int found;
   int len =0; 

   fileOpen = 1;   
 

   tf.pid[0]=0; tf.cnt=0;
   iterations = dura / srate;
   delayAmtMin = 100 * srate -5;
   delayAmtMax = 100 * srate +5;  

   DBG("In extract_features");
   
   // Allow the SIGKILL signal
   allow_signal(SIGKILL);

   while (!kthread_should_stop()) {
      if (signal_pending(current)) break;
        usleep_range(delayAmtMin, delayAmtMax);
      if (sampling) {
         found = 0;
         for_each_process(task) {
            //   pr_info("%s [%d]\\n", task->comm, task->pid);
         
            // uncomment for pid matching
            //kstrtoint(ProcessName, 0, &pid_pname);
            //if (task->pid == pid_pname) { found = 1; break; }

            // uncomment for pname matching
            if (!strcmp(task->comm, ProcessName)) { found = 1; break; }
        }

         //ssleep(1);           
         if (found) {
            tf.pid[0]=0; tf.cnt=0;
            DBG("In extract_features found process");
            DBG("Number of iteration %i", iterations);

            memset(&val, 0, sizeof(val));
            startTime = ktime_get();

            // sample at rate indicated for the number of iterations = duration /rate
            for (loop = 0; loop < iterations; loop++) {
               for_each_process(task) {
                  //   pr_info("%s [%d]\\n", task->comm, task->pid);
 
                  // uncomment for pid matching
                  //kstrtoint(ProcessName, 0, &pid_pname);
                  //if (task->pid == pid_pname) {

                  // uncomment for pname matching
                  if (!strcmp(task->comm, ProcessName)) {
                     // fpu_counter - > uage counter floatin point units (not available since version linux 2.13)
 
                     // Memory related features
                     // map_count -> number of memory regions of a process
                     if ((task->active_mm)) {
                        val.map_count = (*task->active_mm).map_count;
                        val.hiwater_rss = (*task->active_mm).hiwater_rss; 
                        val.hiwater_vm = (*task->active_mm).hiwater_vm;
                        val.total_vm = (*task->active_mm).total_vm;
                        val.exec_vm = (*task->active_mm).exec_vm;
                        
                     }  
                     val.utime = task->utime;
                     val.stime = task->stime;
                     val.nvcsw = task->nvcsw;
                     val.nivcsw = task->nivcsw;
                     val.min_flt = task->min_flt;
                  }
               }  

               memcpy(&tf.pid[tf.cnt], &val, sizeof(val));
               tf.cnt += sizeof(val);
               memset(&val, 0, sizeof(val));

              timeTaken_us = ktime_us_delta(ktime_get(), startTime);
               if (timeTaken_us < delayAmtMin) {
                  usleep_range(delayAmtMin - timeTaken_us, delayAmtMax - timeTaken_us);
               }
               else {
                  DBG("exceeded by %llu on iteration: %i", (timeTaken_us-delayAmtMin), loop);
               }
               startTime = ktime_get();
            }

            // Open file to save samples
            len = strlen(path);	
            strcpy(fullFileName, path);
            if (fullFileName[len-1] != '/') strcat(fullFileName, "/");
            strcat(fullFileName, ProcessName);
            strcat(fullFileName, ".dat");
            printk(KERN_INFO "File to Open: %s\\n", fullFileName);
            filepath = fullFileName; // set for disk write code
 
            // open file to save samples use path when driver started and set process name as file name .dat
            DBG("Initilizing Dump...");
            if((err = setup())) {
               DBG("Setup Error");
               cleanup();
            }

            if (!err) err = writeFormatData();

            // close file if open
            cleanup();  // close sample file
  
            fileOpen = 0;
            sampling = 0;
         }
      }
   }
 
   DBG("Leaving extract_features");

   do_exit(0);
   return 0;
}

// Above: features are extracted for the matching process and placed in the buffer. Below: the buffered samples are formatted and written out.

/*
 * writeFormatData - write every buffered sample out as one CSV line.
 *
 * tf.pid holds tf.cnt bytes of back-to-back `sample` records. Each record is
 * copied out, formatted as eleven comma-separated values in the order
 *
 *   map_count,hiwater_rss,hiwater_vm,total_vm,exec_vm,
 *   utime,stime,nvcsw,nivcsw,min_flt,fscount
 *
 * terminated by a newline, and handed to write_buffer().
 *
 * Returns 0 on success, or the first non-zero value returned by
 * write_buffer(); remaining samples are not written after a failure.
 */
static int writeFormatData(void)
{
   int err = 0;
   int i, cnt, loop;
   sample val;
   char writeBuffer[500];

   loop = tf.cnt/sizeof(val);
   DBG("In writeFormatData");
   DBG("cnt: %i", tf.cnt);
   DBG("sample Size: %lu", sizeof(val));
   DBG("loop count: %i", loop);

   for (i = 0; i < loop; i++) {
      // byte offset of the i-th record inside the sample buffer
      cnt = i * sizeof(val);
      memcpy(&val, &tf.pid[cnt], sizeof(val));

      // One bounded format call per record. Eleven integer fields plus
      // separators stay well below sizeof(writeBuffer), and snprintf()
      // always NUL-terminates, so the line cannot overrun the buffer.
      snprintf(writeBuffer, sizeof(writeBuffer),
               "%d,%lu,%lu,%lu,%lu,%ld,%ld,%lu,%lu,%lu,%d\n",
               val.map_count,    // number of memory regions of the process
               val.hiwater_rss,  // max number of page frames ever owned
               val.hiwater_vm,   // max number of pages in the memory regions
               val.total_vm,     // address-space size in pages
               val.exec_vm,      // pages in executable memory mappings
               val.utime,        // tick count in user mode
               val.stime,        // tick count in kernel mode
               val.nvcsw,        // voluntary context switches
               val.nivcsw,       // involuntary context switches
               val.min_flt,      // minor page faults
               val.fscount);     // number of file usage (fs count/users)

      err = write_buffer(writeBuffer);
      if (err) return err;
   }
   return err;
}

Using the driver, we collected data for the two common process types: CLI and GUI processes. The Command Line Interface (CLI) processes that we chose were ls, ps, and ipconfig. The Graphical User Interface (GUI) processes that we chose were gedit and firefox.

The data was then processed for use in training and developing our machine learning model using Logistic Regression.

Screenshot 2023-02-23 185413.jpg

Finally, the model was trained using the Waikato Environment for Knowledge Analysis (WEKA) using the classifier model j4.8. This logistic regression based model leveraged 10-fold cross-validation in the testing of the model.

WEKA helps data-driven organizations seamlessly and sustainably store, process, and manage data virtually anywhere with cloud simplicity and on-prem performance. Our software-defined, cloud-native data platform turns stagnant data silos into streaming data pipelines that fuel next-generation workloads like AI and HPC. - WEKA

Screenshot 2022-09-21 173332.jpg