I did something similar not too long ago. I was pleasantly surprised by how well Pandas and Numpy play together, and by the resulting speed when sticking to vectorized operations.
The example below requires no files other than the source itself. Modify the table to suit your needs.
# io.StringIO works on Python 3; the old top-level StringIO module was
# Python 2 only. (io.StringIO exists on Python 2 as well.)
from io import StringIO

import pandas as pd
import numpy as np

# Inline sample table; adapt the columns/rows to your own data.
src = """id1, id2, keyword, freq, gp1, gps2
222, 111, #paris, 100, loc1, loc2
444, 234, have, 1000, loc3, loc4
434, 134, #USA, 30, loc5, loc6
234, 234, she, 600, loc1, loc2
523, 5234,mobile, 900, loc3, loc4
"""
src_handle = StringIO(src)

# Words whose rows should be dropped from the table.
blacklist_words = """
have she and did
""".split()

# Split on a comma plus any trailing whitespace. The raw string avoids the
# invalid "\s" escape; a regex separator requires the python parser engine
# (read_csv replaces the deprecated read_table).
table = pd.read_csv(src_handle, sep=r",\s*", engine="python")

# A straight comparison yields a boolean Series usable as a row filter.
filter_have = table["keyword"] == "have"

# Which you can use as a key directly.
print(table[filter_have])
# We'll solve this by building the filter you need and applying it.
def filter_on_blacklisted_words(keyword, blacklist_words, dataframe):
    """Return *dataframe* without the rows whose column {keyword} holds a
    value present in *blacklist_words*. Fully vectorized for performance.

    Parameters:
        keyword: name of the column to test.
        blacklist_words: iterable of values to reject.
        dataframe: the pandas DataFrame to filter.

    Returns:
        A view of *dataframe* containing only the non-blacklisted rows.
    """
    # Series.isin builds the whole membership mask in one vectorized pass,
    # replacing the per-word logical_or loop. It also sidesteps the original
    # np.zeros_like seed, which inherited the column's (string) dtype instead
    # of being a proper boolean array.
    return dataframe[~dataframe[keyword].isin(blacklist_words)]
# print() with a single argument is valid in both Python 2 and 3
# (the bare print statement is Python 2 only).
print(filter_on_blacklisted_words("keyword", blacklist_words, table))