Source code for url2features.simple

# -*- coding: utf-8 -*-
import pandas as pd 
import numpy as np
import string
import math
import os
import re

from .process import load_word_list

"""
    url2features.simple: Basic text feature calculation.

    Calculate simple informative statistics about the URL,
    such as the inferred depth using slash chars, presence of dates or
    indicators of common CMS.
"""

########################################################################################
def simple_features(df, columns, add_prefix=True):
    """
    Add the simple text summary features for several columns.

    A copy of ``df`` is taken first, then ``add_simple_features`` is applied
    to that copy once per entry in ``columns`` (in order). The caller's
    dataframe is left untouched and the enriched copy is returned.

    :param df: pandas dataframe holding the text columns.
    :param columns: iterable of column names to process.
    :param add_prefix: forwarded unchanged to ``add_simple_features``.
    :return: the copied dataframe with the feature columns added.
    """
    result = df.copy()
    for column_name in columns:
        result = add_simple_features(result, column_name, add_prefix)
    return result
########################################################################################
def add_simple_features(df, col, add_prefix=True):
    """
    Add simple character-level summary features for one column.

    Five columns are written onto ``df``; the frame is modified in place and
    also returned:

    * length     -- number of characters in the value
    * punct      -- fraction of characters found in ``string.punctuation``
    * numeric    -- fraction of characters found in ``string.digits``
    * capital    -- fraction of characters for which ``str.isupper()`` holds
    * path depth -- value returned by ``null_tolerant_depth``

    Missing values (``None``, NaN, ``pd.NA``) produce -1 for all five
    features. An empty string produces length 0 and ratios of 0.0 instead of
    raising ``ZeroDivisionError``.

    :param df: pandas dataframe containing ``col``.
    :param col: name of the column to summarise.
    :param add_prefix: when true the new columns are named ``<col>_length``,
        ``<col>_punct``, ``<col>_numeric``, ``<col>_capital`` and
        ``<col>_path_depth``; otherwise the fixed names ``url_length``,
        ``url_punct``, ``url_numeric``, ``url_capital`` and ``path_depth``
        are used (a second call without prefix overwrites them).
    :return: ``df`` with the five feature columns added.
    """
    def simp_feats(row, col):
        value = row[col]
        # pd.isna covers None and pd.NA as well as NaN; the former
        # ``value != value`` test only recognised NaN.
        if pd.isna(value):
            return -1, -1, -1, -1, -1
        length = len(value)
        if length == 0:
            # Nothing to count; avoid dividing by zero.
            punct = numeric = capital = 0.0
        else:
            punct = sum(1 for ch in value if ch in string.punctuation) / length
            numeric = sum(1 for ch in value if ch in string.digits) / length
            capital = sum(1 for ch in value if ch.isupper()) / length
        depth = null_tolerant_depth(value)
        return length, punct, numeric, capital, depth

    if add_prefix:
        col_names = [
            col + "_length",
            col + "_punct",
            col + "_numeric",
            col + "_capital",
            col + "_path_depth",
        ]
    else:
        col_names = [
            "url_length",
            "url_punct",
            "url_numeric",
            "url_capital",
            "path_depth",
        ]
    df[col_names] = df.apply(simp_feats, col=col, axis=1, result_type="expand")
    return df
########################################################################################
def null_tolerant_len(x):
    """
    Return ``len(x)``, treating missing values as length 0.

    Both ``None`` and NaN (detected by the ``x != x`` self-inequality) count
    as missing. Previously ``None`` fell through to ``len(None)`` and raised
    ``TypeError``.

    :param x: a sized value (e.g. a string), ``None`` or NaN.
    :return: 0 for a missing value, otherwise ``len(x)``.
    """
    if x is None or x != x:
        return 0
    return len(x)
########################################################################################
def null_tolerant_depth(x):
    """
    Return the number of "/"-separated segments in a URL, or 0 if missing.

    The value is passed through ``remove_protocol_and_trim`` and the result
    is split on "/"; the number of pieces is the depth.

    Both ``None`` and NaN (detected by ``x != x``) count as missing;
    previously ``None`` reached the regex helper and raised ``TypeError``.
    The value is converted with ``str()`` before the helper is called,
    because the helper's regular expressions require a string.

    :param x: URL value, ``None`` or NaN.
    :return: 0 for a missing value, otherwise the segment count.
    """
    if x is None or x != x:
        return 0
    trimmed = remove_protocol_and_trim(str(x))
    return len(trimmed.split("/"))
########################################################################################
def remove_protocol_and_trim(url):
    """
    Strip a leading ``http://`` / ``https://`` and one trailing slash.

    The protocol match is case-sensitive and anchored at the start of the
    string. After that, a single final "/" together with any whitespace that
    follows it up to the end of the string is removed.

    :param url: URL string.
    :return: the URL without the protocol prefix and trailing slash.
    """
    protocol = re.match(r"^https?://", url)
    if protocol is not None:
        url = url[protocol.end():]

    # A match here always extends to the end of the string, so cutting at
    # its start drops exactly the matched tail.
    tail = re.search(r"/\s*$", url)
    if tail is not None:
        url = url[:tail.start()]
    return url