Source code for url2features.process

# -*- coding: utf-8 -*-
from __future__ import print_function
from io import StringIO
import datetime as dt
import pkg_resources
import pandas as pd 
import numpy as np
import json
import codecs
import sys
import os
import re

from .config import max_filesize

"""
    url2features.process: Support functions for the url2features package.
     Including functions for loading word lists, logging processing times,
     and iteratively processing a large dataset in chunks.
"""
########################################################################################
# Name of this module; the loaders below pass it to pkg_resources as the
# anchor from which the bundled 'data/...' files are resolved.
resource_package = __name__

def load_file(filename):
    """
    Return the text of a data file bundled with the package.

    The file is looked up under the package's ``data`` directory and its
    bytes are decoded as UTF-8.
    """
    resource_path = "data/" + filename
    raw_bytes = pkg_resources.resource_string(resource_package, resource_path)
    return raw_bytes.decode("utf-8")

########################################################################################
def load_dictionary(filename, escape=False):
    """
    Return the object parsed from a JSON file bundled with the package.

    The file is looked up under the package's ``data`` directory and decoded
    as UTF-8. When ``escape`` is true the raw text is passed through
    ``re.escape`` before it is handed to the JSON parser.
    """
    resource_path = "data/" + filename
    raw_bytes = pkg_resources.resource_string(resource_package, resource_path)
    text = raw_bytes.decode("utf-8")
    text = re.escape(text) if escape else text
    return json.loads(str(text))

########################################################################################
def load_word_list(filename, escape=False):
    """
    Utility function to load topic vocab word lists for pattern matching.

    The file is read from the package's ``data`` directory, decoded as UTF-8
    and split into one entry per line. Empty lines are dropped.

    Args:
        filename: Name of the word list file inside ``data``.
        escape: When true, each word is passed through ``re.escape`` so it
            can be embedded verbatim in a regular expression.

    Returns:
        The words as a list of strings, in file order.
    """
    _path = '/'.join(('data', filename))
    rawd = pkg_resources.resource_string(resource_package, _path).decode("utf-8")
    # Split on newlines rather than chopping the final character, so a file
    # without a trailing newline does not lose the end of its last word.
    # A trailing '\r' from CRLF files is dropped as well.
    lines = [line.rstrip("\r") for line in rawd.split("\n")]
    words = [word for word in lines if word]
    if escape:
        # Escape word by word: escaping the whole text would also escape the
        # newline separators and leave a stray backslash on every word.
        words = [re.escape(word) for word in words]
    return words

########################################################################################

def load_word_pattern(filename, prefix="", pluralize=True, bound=True, escape=False):
    """
    Build a regular expression that matches any word in a bundled word list.

    Args:
        filename: Name of the word list file, loaded via ``load_word_list``.
        prefix: Pattern text that must precede a matching word.
        pluralize: When true, each word may be followed by any number of
            ``s`` characters (``s*``).
        bound: When true, each alternative is wrapped in ``\\b`` word
            boundaries.
        escape: Forwarded to ``load_word_list``.

    Returns:
        The pattern as a string of ``|``-separated alternatives.
    """
    word_list = load_word_list(filename, escape=escape)
    delimiter = "\\b" if bound else ""
    tail = ("s*" if pluralize else "") + delimiter
    joiner = tail + "|" + delimiter
    alternatives = delimiter + joiner.join(word_list) + tail
    if prefix:
        # '|' has the lowest precedence, so without a group the prefix would
        # apply to the first word only. The group is non-capturing so that
        # group numbering is unaffected.
        return prefix + "(?:" + alternatives + ")"
    return alternatives

########################################################################################
"""
   This is a set of functions to allow the application to print time
   profiles of the various feature engines to STDERR.
"""

def eprint(*args, **kwargs):
    """Print to STDERR; accepts the same arguments as ``print`` except ``file``."""
    stream = sys.stderr
    print(*args, file=stream, **kwargs)

# Registry of timing data, keyed by process name. Each value is a dict that
# start_profile/end_profile fill with "start", "end" and "total" entries.
profiles = {}


def initialise_profile():
    """
    Discard all recorded timing data.

    The registry is emptied in place. Assigning a new dict inside the
    function would only create a local variable and leave the module-level
    registry untouched.
    """
    profiles.clear()

def start_profile(proc_name):
    """
    Record the current time as the start of a timing run for ``proc_name``.

    An entry is created on first use; an existing entry keeps its other
    fields and only has its "start" value overwritten.
    """
    started_at = dt.datetime.now()
    entry = profiles.setdefault(proc_name, {})
    entry["start"] = started_at

def end_profile(proc_name):
    """
    Close the current timing run for ``proc_name`` and accumulate its duration.

    The time elapsed since the stored "start" is added to the entry's running
    "total" (or becomes the total on the first run), and the end time is
    stored under "end". Raises ``KeyError`` if ``proc_name`` has no entry.
    """
    finished_at = dt.datetime.now()
    entry = profiles[proc_name]
    elapsed = finished_at - entry["start"]
    entry["end"] = finished_at
    entry["total"] = entry["total"] + elapsed if "total" in entry else elapsed

def print_profiles():
    """Write a two-line heading and one line per profiled process (name, total time) to STDERR."""
    eprint("Computation Time Profile for each Feature Set")
    eprint("---------------------------------------------")
    for name, entry in profiles.items():
        eprint(padded(name), str(entry["total"]))

def padded(k, padto=20):
    """
    Return ``k`` right-padded with spaces to a width of ``padto`` characters.

    Strings that are already ``padto`` characters or longer come back unchanged.
    """
    return k.ljust(padto)

########################################################################################
def process_file_in_chunks(path_to_file, function_to_apply):
    """
    Process a large CSV file chunk by chunk and print the results as CSV.

    The chunk size (in rows) is chosen so that one chunk is roughly
    ``max_filesize`` bytes, assuming rows are of similar size. Each chunk is
    passed to ``function_to_apply`` and the result is written to the output
    stream; only the first chunk is written with a header row. The number of
    chunks processed is reported on STDERR.

    Args:
        path_to_file: Path of the CSV file to read.
        function_to_apply: Callable that receives one chunk (a DataFrame)
            and returns the DataFrame to be written out.
    """
    fsize = os.stat(path_to_file).st_size
    # An empty file would divide by zero here; treat it as fitting in a single
    # chunk and let pandas report the empty input.
    sample_prop = (float(max_filesize) / fsize) if fsize else 1.0
    line_count = count_lines(path_to_file)
    # pandas rejects a chunksize below 1, which plain rounding can produce
    # when the size limit is smaller than about half an average line.
    chunks = max(1, int(round(line_count * sample_prop)))
    data_iterator = pd.read_csv(path_to_file, chunksize=chunks, low_memory=False)
    total_chunks = 0
    for index, chunk in enumerate(data_iterator):
        total_chunks = index + 1
        # Only the first chunk carries the column header.
        print_output(function_to_apply(chunk), header=(index == 0))

    eprint("Chunks processed: ", total_chunks)
 
########################################################################################
def print_output(df, header=True):
    """
    Write ``df`` to STDOUT as CSV, without the index column.

    ``header`` controls whether the column-name row is emitted. No extra
    newline is added after the CSV text.
    """
    csv_text = df.to_csv(index=False, header=header)
    print(csv_text, end='')

########################################################################################
def extract_file_extension(path_to_file):
    """Return the extension of ``path_to_file`` including the leading dot, or '' if it has none."""
    _root, extension = os.path.splitext(path_to_file)
    return extension

########################################################################################
def load_complete_dataframe(path_to_file):
    """
    Load an entire dataset into memory, picking the reader from the file extension.

    Delimited text files are read with encoding='latin1' because it appears
    to permit loading of the largest variety of files. Representation of
    strings may not be perfect, but that is not important for generating a
    summarization of the entire dataset.

    Args:
        path_to_file: Path to a .csv, .tsv, Excel (.xls, .xlsx, .xlsm, .xlsb)
            or OpenDocument (.odf, .ods, .odt) file. The extension is matched
            case-insensitively.

    Returns:
        The file content as a pandas DataFrame.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    extension = os.path.splitext(path_to_file)[1].lower()
    if extension == ".csv":
        return pd.read_csv(path_to_file, encoding='latin1', low_memory=False)
    if extension == ".tsv":
        return pd.read_csv(path_to_file, encoding='latin1', sep='\t', low_memory=False)
    # Every spreadsheet extension that pandas.read_excel documents as
    # supported; '.ods' is the usual OpenDocument spreadsheet suffix.
    if extension in (".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"):
        return pd.read_excel(path_to_file)

    raise ValueError("Unsupported File Type")

########################################################################################
def count_lines(path_to_file):
    """
    Return a count of total lines in a file.

    The file is streamed line by line, so memory use does not depend on the
    file size. The handle is closed before returning.
    """
    # Only line breaks matter here, so undecodable bytes are replaced instead
    # of aborting the count with a UnicodeDecodeError.
    with open(path_to_file, errors="replace") as handle:
        return sum(1 for _ in handle)

########################################################################################
def len_or_null(val):
    """
       Alternative len function that returns numpy.nan for values that
       have no length. This is needed to get sensible results
       when running len over a column that may contain nulls.
    """
    try:
        return len(val)
    except Exception:
        # Best effort for anything len() rejects (None, floats, ...), while
        # still letting KeyboardInterrupt and SystemExit propagate.
        return np.nan

########################################################################################
def isNaN(num):
    """Return the result of comparing ``num`` as unequal to itself, which is true for a float NaN."""
    differs_from_itself = num != num
    return differs_from_itself

########################################################################################
def remove_urls_and_tags(text):
    """
        Remove any obvious text elements that appear to be either
        URLs or HTML tags, as well as literal escape sequences
        (a backslash followed by t, n, r or f).

        Each match is replaced by a single space.
    """
    patterns = (
        r"https?://[-._a-z0-9A-Z]*",
        r"</?[a-zA-Z]* ?[a-zA-Z'=.]* ?/?>",
        # Raw string so the regex sees an escaped backslash; the non-raw
        # "\\[tnrf]" is the regex \[tnrf], which matches the text "[tnrf]".
        r"\\[tnrf]",
    )
    new_text = text
    for p in patterns:
        new_text = re.sub(p, ' ', new_text)
    return new_text

def remove_urls(text):
    """Replace each http(s) scheme-and-host prefix found in ``text`` with a single space."""
    url_regex = re.compile("https?://[-._a-z0-9A-Z]*")
    return url_regex.sub(' ', text)

def remove_tags(text):
    """Replace each simple HTML-style tag found in ``text`` with a single space."""
    tag_regex = re.compile("</?[a-zA-Z]* ?[a-zA-Z'=.]* ?/?>")
    return tag_regex.sub(' ', text)

########################################################################################
def remove_escapes_and_non_printable(text):
    """
        Apply the codecs escape decoder to turn escaped sequences into the
        characters they denote, then replace NUL, newline, carriage return,
        backspace, tab, form feed and vertical tab characters with a space.

        If the text cannot be escape-decoded or the result is not valid
        UTF-8, the original text is used for the replacement step instead.
    """
    try:
        new_text0 = codecs.escape_decode(text)[0].decode("utf-8")
    except Exception:
        # Best effort: keep the input as-is when decoding fails, but do not
        # swallow KeyboardInterrupt or SystemExit as a bare except would.
        new_text0 = text
    # Not a raw string: the pattern holds the actual control characters.
    pattern = "\0|\n|\r|\b|\t|\f|\v"
    new_text1 = re.sub(pattern, " ", new_text0)
    return new_text1


###################################################################
def add_protocol_if_missing(x):
    """
    Return ``x`` unchanged if it starts with a protocol of two to eight
    letters followed by ``://``; otherwise return it with ``http://``
    prepended.
    """
    has_protocol = re.match(r"^[a-zA-Z]{2,8}://", x) is not None
    return x if has_protocol else "http://"+x