Source code for url2features.simple

# -*- coding: utf-8 -*-
import pandas as pd 
import numpy as np
import string
import math
import os
import re

from .process import load_word_list

"""
    url2features.simple: Basic text feature calculation.

    Calculate simple informative statistics about the URL,
    such as the inferred depth using slash characters, the presence of dates, or
    indicators of a common CMS.
"""

########################################################################################
def simple_features(df, columns, add_prefix=True):
    """
        Add the simple text summary features for several columns at once.

        A copy of ``df`` is taken first, so the dataframe passed in is not
        modified. Each name in ``columns`` is then handed in turn to
        ``add_simple_features`` (with ``add_prefix`` forwarded unchanged)
        and the accumulated dataframe is returned.
    """
    augmented = df.copy()
    for column_name in columns:
        augmented = add_simple_features(augmented, column_name, add_prefix)
    return augmented

########################################################################################
def add_simple_features(df, col, add_prefix=True):
    """
        Given a pandas dataframe and a column name,
        calculate the simple features of that column and add them as columns.

        Five values are computed per row: the character length, the
        proportion of punctuation characters, the proportion of digits,
        the proportion of upper-case characters, and the path depth as
        returned by ``null_tolerant_depth``.

        Missing values (``None`` or NaN) produce -1 for every feature.
        An empty string produces a length of 0 and proportions of 0.0
        instead of dividing by zero.

        When ``add_prefix`` is true the new columns are named
        ``<col>_length``, ``<col>_punct``, ``<col>_numeric``,
        ``<col>_capital`` and ``<col>_path_depth``; otherwise the fixed
        names ``url_length``, ``url_punct``, ``url_numeric``,
        ``url_capital`` and ``path_depth`` are used.

        Note that the columns are added to ``df`` itself and that same
        object is returned.
    """

    def proportion(text, charset):
        # Share of characters in `text` that belong to `charset`.
        return sum(1 for ch in text if ch in charset) / len(text)

    # The parameter must stay named `col`: DataFrame.apply forwards it
    # as a keyword argument.
    def simp_feats(row, col):
        value = row[col]
        # NaN is the only value not equal to itself; None needs its own
        # check because `None != None` is False.
        if value is None or value != value:
            return -1, -1, -1, -1, -1

        length = len(value)
        if length == 0:
            # No characters to count: avoid ZeroDivisionError.
            punct = numeric = capital = 0.0
        else:
            punct = proportion(value, string.punctuation)
            numeric = proportion(value, string.digits)
            capital = sum(1 for ch in value if ch.isupper()) / length
        depth = null_tolerant_depth(value)
        return length, punct, numeric, capital, depth

    if add_prefix:
        col_names = [col+"_length", col+"_punct", col+"_numeric", col+"_capital", col+"_path_depth"]
    else:
        col_names = ["url_length","url_punct", "url_numeric", "url_capital", "path_depth"]

    df[ col_names ] = df.apply(simp_feats, col=col, axis=1, result_type="expand")

    return df

########################################################################################

def null_tolerant_len(x):
    """
        Return ``len(x)``, or 0 when ``x`` is missing.

        Both ``None`` and NaN count as missing. NaN is detected by the
        fact that it is not equal to itself; ``None`` is checked
        explicitly because ``None != None`` is False and ``len(None)``
        would raise a TypeError.
    """
    if x is None or x != x:
        return 0
    return len(x)

########################################################################################

def null_tolerant_depth(x):
    """
        Return the number of slash-separated segments in the URL ``x``,
        or 0 when ``x`` is missing.

        Both ``None`` and NaN count as missing (NaN is the value that is
        not equal to itself). Otherwise the URL is first passed through
        ``remove_protocol_and_trim`` and the result is split on "/";
        the number of resulting pieces is the depth.
    """
    if x is None or x != x:
        return 0
    trimmed = remove_protocol_and_trim(x)
    return len( str(trimmed).split("/") )

########################################################################################

def remove_protocol_and_trim(url):
    """
        Strip a leading ``http://`` or ``https://`` and one trailing slash
        (plus any whitespace after it) from ``url`` and return the result.

        The scheme is matched case-insensitively, so ``HTTP://`` and
        ``Https://`` are removed as well. Other schemes, and URLs with no
        scheme, are left as they are.
    """
    # Anchored at the start, so at most one scheme prefix can match.
    url = re.sub(r"^https?://", "", url, count=1, flags=re.IGNORECASE)
    # Drop a final "/" together with any whitespace that follows it.
    url = re.sub(r"/\s*$", "", url)
    return url