# Source code for url2features.file

# -*- coding: utf-8 -*-
from urllib import parse
import pkg_resources
import pandas as pd
import numpy as np
import math
import os
import re


"""
    url2features.file: Features based on the file type.
"""
from .process import load_dictionary
from .process import add_protocol_if_missing

# Extension lookup table, built once at import time from 'file_extensions.dat'.
# It is queried with lower-cased extensions (see add_file_features and
# file_extension_lookup); the stored values appear to include the categories
# "static", "dynamic" and "media" (see file_extension_lookup_old).
extension_types = load_dictionary('file_extensions.dat')

########################################################################################
def file_features(df, columns, add_prefix=True):
    """
        Add the file summary features for each of the given URL columns.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data. It is copied first, so the caller's frame is left
            untouched.
        columns : str or iterable of str
            Name(s) of the column(s) to process. A single string is treated
            as one column name.
        add_prefix : bool
            Forwarded to add_file_features; when true the generated column
            names are prefixed with the source column name.

        Returns
        -------
        pandas.DataFrame
            The copy of ``df`` with the feature columns added.
    """
    if isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = [columns]
    rez = df.copy()
    for col in columns:
        rez = add_file_features(rez, col, add_prefix)
    return rez

def remove_extension(file_name):
    """
        Return file_name with its final ".extension" removed.

        Only the text after the last dot (and that dot) is dropped, so
        "archive.tar.gz" becomes "archive.tar". A name without any dot is
        returned unchanged.
    """
    stem, dot, _ = file_name.rpartition(".")
    # rpartition yields an empty separator when there is no dot at all.
    return stem if dot else file_name

########################################################################################
def add_file_features(df, col, add_prefix):
    """
        Calculate the file features for one URL column and add them to df.

        The last path segment of each URL is treated as the file name. The
        following columns are added (prefixed with ``col + '_'`` when
        add_prefix is true):

        file_len, file_1st_wd_prefix, file_1st_wd, file_wd_count,
        file_wd_len, file_extn, file_extn_type, file_extn_exists

        Rows whose value in ``col`` is not a string (NaN, None, pd.NA, ...)
        receive the default values: empty strings and zeros.

        Note: ``df`` is modified in place and also returned.
    """
    def get_file_features(x, col):
        ext = ""
        ext_type = ""
        ext_exists = 0
        file_len = 0
        file_wds = 0
        file_wd_len = 0
        file_1st_wd_prefix = ""
        file_1st_wd = ""
        value = x[col]
        # Only real strings can be parsed as URLs. A plain `value == value`
        # NaN test raises on pd.NA and lets None through.
        if isinstance(value, str):
            url = add_protocol_if_missing(value)
            path = parse.urlparse(url.strip()).path
            final_file = path.split("/")[-1]
            file_len = len(final_file)
            # Word statistics are computed on the name without its extension.
            (file_1st_wd_prefix, file_1st_wd,
             file_wds, file_wd_len) = extract_word_stats(remove_extension(final_file))
            file_parts = final_file.split(".")
            if len(file_parts) > 1:
                ext = file_parts[-1].lower()
                ext_type = file_extension_lookup(ext)
                ext_exists = 1
        return (file_len, file_1st_wd_prefix, file_1st_wd, file_wds,
                file_wd_len, ext, ext_type, ext_exists)

    base_names = ['file_len', 'file_1st_wd_prefix', 'file_1st_wd',
                  'file_wd_count', 'file_wd_len',
                  'file_extn', 'file_extn_type', 'file_extn_exists']
    if add_prefix:
        col_names = [col + '_' + name for name in base_names]
    else:
        col_names = base_names

    df[col_names] = df.apply(get_file_features, col=col, axis=1, result_type="expand")

    return df

########################################################################################
def extract_word_stats(path):
    """
        Split path into words and return summary statistics about them.

        Words are separated by runs of '-', '_', '/' or '~'. Only words
        longer than two characters count as "real" words for the first-word
        and word-count results, but the mean length is taken over every
        piece produced by the split (including empty ones).

        Returns a tuple of:
            first-word prefix (first three characters, lower-cased),
            first word (lower-cased, "" when there is none),
            number of words longer than two characters,
            mean length of all split pieces.
    """
    wds = re.split(r"[-_/~]+", path)
    # re.split always yields at least one piece, so the mean is well defined.
    wd_len = np.mean([len(w) for w in wds])
    my_wds = [w for w in wds if len(w) > 2]
    fst_wd = my_wds[0] if my_wds else ""
    # Slicing leaves words of three characters or fewer unchanged.
    fst_wd_pre = fst_wd[:3]

    return fst_wd_pre.lower(), fst_wd.lower(), len(my_wds), wd_len


########################################################################################
def file_extension_lookup(ext):
    """
        Given a file extension, return its type from extension_types.

        Returns an empty string when the extension is not in the table.
    """
    return extension_types[ext] if ext in extension_types else ""

########################################################################################
def file_extension_lookup_old(ext):
    """
        Given a file extension, return an integer code for its type.

        Codes: 1 = "static", 2 = "dynamic", 3 = "media", 4 = any other
        type found in extension_types, -1 = extension not in the table.
    """
    if ext not in extension_types:
        return -1

    typer = extension_types[ext]
    if typer == "static":
        return 1
    if typer == "dynamic":
        return 2
    if typer == "media":
        return 3
    return 4