# Source code for url2features.featurize

# -*- coding: utf-8 -*-
import functools
import pandas as pd 
import numpy as np

from .process import start_profile
from .process import end_profile
from .simple import simple_features
from .protocol import protocol_features
from .host import host_features
from .tld import tld_features
from .file import file_features
from .path import path_features
from .params import params_features
from .dns import dns_features

"""
    url2features.featurize: Core functions to apply a set of features to a data frame.
"""
########################################################################################

def _profiled_stage(label, featurizer, df, params, add_prefix):
    """
    Run a single feature generator bracketed by profiling calls.

    Calls ``start_profile(label)``, then ``featurizer(df, params["columns"],
    add_prefix)``, then ``end_profile(label)``, and returns the featurizer's
    result.

    NOTE: if the featurizer raises, ``end_profile`` is not called; the
    exception propagates unchanged.
    """
    start_profile(label)
    # "columns" is looked up only once a stage actually runs, so a params
    # mapping with every stage disabled does not need that key.
    df = featurizer(df, params["columns"], add_prefix)
    end_profile(label)
    return df


def process_df(df, params):
    """
    process_df: Function that co-ordinates the process of generating the features

    Each feature group is switched on by a truthy entry in ``params`` under
    the keys "simple", "protocol", "host", "tld", "path", "file", "params"
    and "dns". Enabled groups run in exactly that order, each receiving the
    data frame returned by the previous one.

    Args:
        df: The data frame (or chunk) handed to the feature generators.
        params: Mapping holding "prefix", "columns" and the eight group
            flags listed above. "prefix" and every flag are always read;
            "columns" is read only when at least one group is enabled.

    Returns:
        The value returned by the last enabled feature generator, or ``df``
        itself when no group is enabled.

    Raises:
        KeyError: If ``params`` lacks a key that is read.
    """
    add_prefix = params["prefix"]

    if params["simple"]:
        df = _profiled_stage("simple", simple_features, df, params, add_prefix)

    if params["protocol"]:
        df = _profiled_stage("protocol", protocol_features, df, params, add_prefix)

    if params["host"]:
        df = _profiled_stage("host", host_features, df, params, add_prefix)

    if params["tld"]:
        df = _profiled_stage("tld", tld_features, df, params, add_prefix)

    if params["path"]:
        df = _profiled_stage("path", path_features, df, params, add_prefix)

    if params["file"]:
        df = _profiled_stage("file", file_features, df, params, add_prefix)

    if params["params"]:
        df = _profiled_stage("params", params_features, df, params, add_prefix)

    if params["dns"]:
        df = _profiled_stage("dns", dns_features, df, params, add_prefix)

    return df
########################################################################################
def generate_feature_function(parameters):
    """
    Build a reusable feature-generation callable from processed arguments.

    The processed command line arguments in ``parameters`` are bound to the
    ``params`` keyword of ``process_df``. The returned callable therefore
    only needs the data frame as its positional argument, so the same
    settings can be applied to multiple chunks of a data frame.
    """
    bound_processor = functools.partial(process_df, params=parameters)
    return bound_processor