Skip to content

Commit

Permalink
merging
Browse files Browse the repository at this point in the history
  • Loading branch information
dkrasner committed Dec 7, 2013
2 parents 883ed6a + 3a60d69 commit 4913519
Showing 1 changed file with 1 addition and 38 deletions.
39 changes: 1 addition & 38 deletions rosetta/text/vw_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,44 +422,7 @@ def _prob_func(self, df, rows, cols, c_rows, c_cols):

return df

def predict_old(self, tokenized_text):
    r"""
    Return a probability distribution over topics given that a (tokenized)
    document is equal to tokenized_text.

    This is NOT equivalent to prob_token_topic(c_token=tokenized_text),
    since that is an OR statement about the tokens, and this is an AND.

    Parameters
    ----------
    tokenized_text : List of strings
        Represents the tokens that are in some document text.  May be
        empty, in which case the normalized prior self.pr_topic is
        returned (no evidence, so the posterior equals the prior).

    Returns
    -------
    prob_topics : Series
        self.pr_topic_g_doc is an example of a (large) frame of this type.
        The values are normalized to sum to one.

    Notes
    -----
    P(topic | tok1, tok2) \propto P(topic) P(tok1, tok2 | topic)
                          = P(topic) P(tok1 | topic) P(tok2 | topic)

    Any P(tok | topic) that comes back as NaN from prob_token_topic is
    replaced by the uniform value 1 / self.num_topics before multiplying.
    """
    # Imported locally so the method works on Python 2 and Python 3
    # (reduce is no longer a builtin on Python 3).
    from functools import reduce
    from operator import mul

    # Value substituted for missing (NaN) token probabilities.
    na_val = 1. / self.num_topics

    def token_likelihood(tok):
        """Return P(tok | topic) for every topic as a flat array."""
        frame = self.prob_token_topic(token=tok, topic=self.topics)
        return frame.fillna(na_val).values.ravel()

    # Multiply out P(tok1 | topic) P(tok2 | topic) ...
    # The initial value 1.0 is the multiplicative identity: it leaves the
    # product unchanged for non-empty input and keeps reduce from raising
    # TypeError when tokenized_text is empty.
    likelihood = reduce(
        mul, (token_likelihood(tok) for tok in tokenized_text), 1.0)

    # Multiply by P(topic)
    probs = self.pr_topic * likelihood

    return probs / probs.sum()


def predict(
self, tokenized_text, maxiter=50, atol=1e-3, raise_on_unknown=False):
Expand Down

0 comments on commit 4913519

Please sign in to comment.