diff --git a/llmebench/datasets/dataset_base.py b/llmebench/datasets/dataset_base.py
index 1248ec58..0e88e0d6 100644
--- a/llmebench/datasets/dataset_base.py
+++ b/llmebench/datasets/dataset_base.py
@@ -10,6 +10,42 @@
 
 
 class DatasetBase(ABC):
+    """
+    Base class for datasets
+
+    Implementations of this class need to define at least three mandatory methods:
+    `metadata()`, `get_data_sample()` and `load_data()`. The purpose of objects of
+    this class is to encapsulate all the subtleties and information for a specific
+    dataset, and provide a consistent way for the framework to read the dataset.
+
+    Attributes
+    ----------
+        None
+
+    Methods
+    -------
+    metadata():
+        Returns metadata for the dataset
+
+    get_data_sample():
+        Returns one sample of data. Useful to see the structure of loaded data
+
+    load_data(data_path, no_labels=False):
+        Loads data from the given path and returns a list of data samples
+
+    prepare_fewshots(target_data, train_data, n_shots, deduplicate=True):
+        Returns a generator that provides few shot samples for every test sample
+
+    Notes
+    -----
+    - Consider overriding `_deduplicate_train_test` to replace the default "input_id"
+      based de-duplication between train/test
+    - If the data is not JSON serializable, `_stringify_sample`/`_destringify_sample`
+      must be re-implemented to provide serialization/deserialization of samples. This
+      is primarily used for some fewshot sampling methods.
+
+    """
+
     def __init__(self, **kwargs):
         pass
 
@@ -17,40 +53,93 @@ def __init__(self, **kwargs):
     @abstractmethod
     def metadata():
         """
-        Must return a dictionary with the following keys:
-            "citation": str
+        Returns the dataset's metadata
+
+        Arguments
+        ---------
+            None
+
+        Returns
+        -------
+        metadata : dict
+            The returned dictionary _must_ have the following keys:
+            "citation" : str
                 bib-formatted citation for the dataset
-            "language": str|list
+            "language" : str|list
                 Can be one of:
                     "multilingual"
-                    ["ar", "fr", "en"] # List of supported langauges
+                    ["ar", "fr", "en"] # List of supported languages
                     "ar" # Single supported language
                 Languages should be identified by their IETF language tags
-            "download_url": str (optional)
-                URL to data to automatically download if not present
+            The returned dictionary _can_ have the following additional keys:
+            "download_url" : str (optional)
+                URL to data (for automatic downloads)
         """
         pass
 
     @abstractmethod
     def get_data_sample(self):
+        """
+        Returns a single data sample.
+
+        This function is useful to understand the structure of the underlying
+        data. All loaded samples _must_ match this sample.
+
+        Arguments
+        ---------
+            None
+
+        Returns
+        -------
+        sample : dict
+            _Must_ contain at least two keys "input" and "label".
+            "input_id" can be specified to help with de-duplication
+            between train/dev/test data. Can include additional keys.
+        """
         pass
 
     @abstractmethod
     def load_data(self, data_path, no_labels=False):
         """
-        Returns a list of dictionaries,
-        with at least the following keys:
-                "input": <input-instance>
-                "label": <label>
-        The dictionaries can contain other keys as well
-        which will be saved in the cache
+        Load data from data_path.
+
+        Arguments
+        ---------
+        data_path : str|list|dict
+            Path to dataset. Can be a list or dict as well.
+        no_labels : bool
+            Specifies if the data_path has a split with no labels
+
+        Returns
+        -------
+        data : list
+            List of dictionaries, where each dictionary is structured like
+            `get_data_sample()`'s output
         """
         pass
 
-    def deduplicate_train_test(self, train_data, test_data):
+    def _deduplicate_train_test(self, train_data, test_data):
+        """
+        Filter train data to avoid overlap with test data
+
+        The default implementation de-duplicates based on an "input_id"
+        element in the sample dictionary.
+
+        Arguments
+        ---------
+        train_data : list
+            Loaded train data
+        test_data : list
+            Loaded test data
+
+        Returns
+        -------
+        filtered_train_data : list
+            Train data with overlapping test samples removed
+        """
         if len(test_data) > 0 and "input_id" not in test_data[0]:
             logging.warning(
-                "`input_id` not found in data, no deduplication will be run"
+                "`input_id` not found in data, no de-duplication will be run"
             )
             # TODO: Add fallback to input, label deep comparison
             return train_data
@@ -65,18 +154,76 @@ def deduplicate_train_test(self, train_data, test_data):
 
         return final_train_data
 
-    def stringify_sample(self, sample):
+    def _stringify_sample(self, sample):
+        """
+        Serialize data sample into a string.
+
+        Primarily used for some fewshot samplers that work only on strings.
+        By default uses JSON serialization. If the data is not JSON serializable,
+        this function must be re-implemented in the implementing class.
+
+        Arguments
+        ---------
+        sample : dict
+            Input sample, with the same structure as that returned by
+            `get_data_sample()`
+
+        Returns
+        -------
+        new_sample : dict
+            Same as the input sample, except the value associated with the key
+            "input" must be a string
+        """
         new_sample = sample.copy()
         new_sample["input"] = json.dumps(new_sample["input"], ensure_ascii=False)
         return new_sample
 
-    def destringify_sample(self, sample):
+    def _destringify_sample(self, sample):
+        """
+        Deserialize data sample from a string.
+
+        Primarily used for some fewshot samplers that work only on strings.
+        By default uses JSON deserialization. If the data is not JSON deserializable,
+        this function must be re-implemented in the implementing class.
+
+        Arguments
+        ---------
+        sample : dict
+            Output of `_stringify_sample()`
+
+        Returns
+        -------
+        new_sample : dict
+            Sample with the same structure as that returned by
+            `get_data_sample()`
+        """
         new_sample = sample.copy()
         new_sample["input"] = json.loads(new_sample["input"])
         return new_sample
 
     def prepare_fewshots(self, target_data, train_data, n_shots, deduplicate=True):
-        """Returns a generator for fewshot samples _per test instance_"""
+        """
+        Returns a generator for fewshot samples _per test instance_
+
+        Arguments
+        ---------
+        target_data : list
+            Test samples
+        train_data : list
+            Train/Dev samples to pick few shot samples from
+        n_shots : int
+            Number of samples to pick for each test sample
+        deduplicate : bool, defaults to True
+            Whether the training samples should be de-duplicated (w.r.t test
+            samples).
+
+        Returns
+        -------
+        fewshot_data : generator
+            A generator that yields, once for every test sample in
+            `target_data`, the `n_shots` train samples selected as its
+            few shot examples
+        """
 
         # Stringify inputs for few shot
         deserialization_required = False
@@ -85,7 +232,7 @@ def prepare_fewshots(self, target_data, train_data, n_shots, deduplicate=True):
                 "`input` is not a string, JSON stringifying for few shot preparation"
             )
             deserialization_required = True
-            train_data = [self.stringify_sample(sample) for sample in train_data]
+            train_data = [self._stringify_sample(sample) for sample in train_data]
 
         # Remove empty inputs
         original_sample_count = len(train_data)
@@ -103,7 +250,7 @@ def prepare_fewshots(self, target_data, train_data, n_shots, deduplicate=True):
         # We discovered some datasets had overlap between train and test
         if deduplicate:
             original_sample_count = len(train_data)
-            train_data = self.deduplicate_train_test(train_data, target_data)
+            train_data = self._deduplicate_train_test(train_data, target_data)
             filtered_sample_count = len(train_data)
             if filtered_sample_count < original_sample_count:
                 logging.warning(
@@ -122,7 +269,7 @@ def prepare_fewshots(self, target_data, train_data, n_shots, deduplicate=True):
         # For each input sample, get few shot examples
         for idx, input_sample in enumerate(target_data):
             if deserialization_required:
-                input_sample = self.stringify_sample(input_sample)
+                input_sample = self._stringify_sample(input_sample)
             if len(input_sample["input"].strip()) > 0:
                 examples = example_selector.select_examples(input_sample)
             else:
@@ -134,6 +281,6 @@ def prepare_fewshots(self, target_data, train_data, n_shots, deduplicate=True):
 
             if deserialization_required:
                 # Deserialize example
-                examples = [self.destringify_sample(sample) for sample in examples]
+                examples = [self._destringify_sample(sample) for sample in examples]
 
             yield examples
diff --git a/llmebench/models/HuggingFaceInferenceAPI.py b/llmebench/models/HuggingFaceInferenceAPI.py
index baab23f2..e7805492 100644
--- a/llmebench/models/HuggingFaceInferenceAPI.py
+++ b/llmebench/models/HuggingFaceInferenceAPI.py
@@ -30,6 +30,8 @@
 
 
 class HuggingFaceModelLoadingError(Exception):
+    """Exception class to capture loading errors"""
+
     def __init__(self, failure_message):
         self.failure_message = failure_message
 
@@ -38,15 +40,21 @@ def __str__(self):
 
 
 class HuggingFaceInferenceAPIModel(ModelBase):
-    """An interface to HuggingFace Inference API
-
-    Args:
-        task_type: one of Summarization, Sentence_Similarity, Text_Generation, Text2Text_Generation, Translation,
-          Feature_Extraction, Fill_Mask, Question_Answering, Table_Question_Answering, Text_Classification,
-          Token_Classification, Named_Entity_Recognition, Zero_Shot_Classification, Conversational as found on
-          HuggingFace model's page
-        inference_api_url: the URL to the particular model, as found in the Deploy > Inference API menu in the model's page
-        api_token: HuggingFace API access key (can also be read from enviroment variable HUGGINGFACE_API_TOKEN)
+    """
+    An interface to HuggingFace Inference API
+
+    Arguments
+    ---------
+    task_type : HuggingFaceTaskTypes
+        One of Summarization, Sentence_Similarity, Text_Generation, Text2Text_Generation, Translation,
+        Feature_Extraction, Fill_Mask, Question_Answering, Table_Question_Answering, Text_Classification,
+        Token_Classification, Named_Entity_Recognition, Zero_Shot_Classification, Conversational as found on
+        HuggingFace model's page
+    inference_api_url : str
+        The URL to the particular model, as found in the Deploy > Inference API menu in the model's page
+    api_token : str
+        HuggingFace API access key. If not provided, will be inferred from the environment variable
+        `HUGGINGFACE_API_TOKEN`
     """
 
     def __init__(self, task_type, inference_api_url, api_token=None, **kwargs):
@@ -64,6 +72,27 @@ def __init__(self, task_type, inference_api_url, api_token=None, **kwargs):
         )
 
     def prompt(self, processed_input):
+        """
+        HuggingFace Inference API Implementation
+
+        Arguments
+        ---------
+        processed_input : dictionary
+            Must be a dictionary with one key "inputs", the value of which will
+            depend on the task type. See https://huggingface.co/docs/api-inference/detailed_parameters
+            for detailed parameters.
+
+        Returns
+        -------
+        response : dict
+            Response from the HuggingFace Inference API
+
+        Raises
+        ------
+        HuggingFaceModelLoadingError : Exception
+            This method raises this exception if the model is not yet loaded on
+            HuggingFace. Retrying after a few seconds is the usual remedy.
+        """
         headers = {"Authorization": f"Bearer {self.api_token}"}
         data = json.dumps(processed_input)
         response = requests.request(
@@ -78,7 +107,8 @@ def prompt(self, processed_input):
 
     def summarize_response(self, response):
         """
-        This method will attempt to interpret the output based on the task type. Otherwise, it returns the response object as is.
+        This method will attempt to interpret the output based on the task type.
+        Otherwise, it returns the response object as is.
         """
         output_types = {
             HuggingFaceTaskTypes.Summarization: str,
diff --git a/llmebench/models/OpenAI.py b/llmebench/models/OpenAI.py
index 338d36d8..d86f4b74 100644
--- a/llmebench/models/OpenAI.py
+++ b/llmebench/models/OpenAI.py
@@ -6,14 +6,52 @@
 
 
 class OpenAIModelBase(ModelBase):
+    """
+    OpenAI Model interface. Can be used for models hosted on both OpenAI's platform and
+    on Azure.
+
+    Arguments
+    ---------
+    api_type : str
+        Must be one of "openai" or "azure". If not provided, the implementation will try
+        to infer it from environment variables `OPEN_API_TYPE`, `AZURE_*` or default to
+        "openai"
+    api_base : str
+        URL where the model is hosted. Can be left as None for models hosted on OpenAI's
+        platform. If not provided, the implementation will look at environment variables
+        `OPENAI_API_BASE` or `AZURE_API_URL`
+    api_version : str
+        Version of the API to use. If not provided, the implementation will derive it
+        from environment variables `OPENAI_API_VERSION` or `AZURE_API_VERSION`. Must be
+        left as None for models hosted on OpenAI's platform
+    api_key : str
+        Authentication token for the API. If not provided, the implementation will derive it
+        from environment variables `OPENAI_API_KEY` or `AZURE_API_KEY`.
+    model_name : str
+        Name of the model to use. If not provided, the implementation will derive it from
+        environment variables `OPENAI_MODEL` or `AZURE_ENGINE_NAME`
+    engine_name : str
+        Alternative for `model_name`
+    temperature : float
+        Temperature value to use for the model. Defaults to zero for reproducibility.
+    top_p : float
+        Top P value to use for the model. Defaults to 0.95
+    max_tokens : int
+        Maximum number of tokens to pass to the model. Defaults to 800
+    frequency_penalty : float
+        Frequency Penalty to use for the model.
+    presence_penalty : float
+        Presence Penalty to use for the model.
+    """
+
     def __init__(
         self,
         api_type=None,
         api_base=None,
         api_version=None,
         api_key=None,
-        engine_name=None,
         model_name=None,
+        engine_name=None,
         temperature=0,
         top_p=0.95,
         max_tokens=800,
@@ -70,7 +108,7 @@ def __init__(
 
         if model_name is None:
             raise Exception(
-                "Model/Engine must be provided as model config or enviroment variable `OPENAI_MODEL`/`AZURE_ENGINE_NAME`"
+                "Model/Engine must be provided as model config or environment variable `OPENAI_MODEL`/`AZURE_ENGINE_NAME`"
             )
 
         if api_type == "azure":
@@ -128,6 +166,7 @@ def create_prompt(self, system_message, messages):
         return prompt
 
     def summarize_response(self, response):
+        """Returns the first reply, if available"""
         if (
             "choices" in response
             and isinstance(response["choices"], list)
@@ -136,9 +175,28 @@ def summarize_response(self, response):
         ):
             return response["choices"][0]["text"]
 
-        return None
+        return response
 
     def prompt(self, processed_input):
+        """
+        OpenAI API Completion implementation
+
+        .. warning::
+            This implementation is deprecated and will be removed in future
+            versions. Use `OpenAIModel` instead.
+
+        Arguments
+        ---------
+        processed_input : dict
+            Must be a dictionary with two keys: "system_message" with a string
+            value, and "messages" with a list value, where each element is a
+            dictionary with two string-valued keys, "sender" and "text".
+
+        Returns
+        -------
+        response : OpenAI API response
+            Response from the openai python library
+        """
         system_message = processed_input["system_message"]
         messages = processed_input["messages"]
         prompt = self.create_prompt(system_message, messages)
@@ -151,6 +209,7 @@ def prompt(self, processed_input):
 
 class OpenAIModel(OpenAIModelBase):
     def summarize_response(self, response):
+        """Returns the first reply from the "assistant", if available"""
         if (
             "choices" in response
             and isinstance(response["choices"], list)
@@ -161,9 +220,25 @@ def summarize_response(self, response):
         ):
             return response["choices"][0]["message"]["content"]
 
-        return None
+        return response
 
     def prompt(self, processed_input):
+        """
+        OpenAI API ChatCompletion implementation
+
+        Arguments
+        ---------
+        processed_input : list
+            Must be a list of dictionaries, where each dictionary has two keys:
+            "role" defines a role in the chat (e.g. "system", "user") and
+            "content" defines the actual message for that turn
+
+        Returns
+        -------
+        response : OpenAI API response
+            Response from the openai python library
+
+        """
         response = openai.ChatCompletion.create(
             messages=processed_input, **self.model_params
         )
diff --git a/llmebench/models/Petals.py b/llmebench/models/Petals.py
index 1f44a834..b25965e0 100644
--- a/llmebench/models/Petals.py
+++ b/llmebench/models/Petals.py
@@ -7,6 +7,8 @@
 
 
 class PetalsFailure(Exception):
+    """Exception class to map various failure types from the Petals server"""
+
     def __init__(self, failure_type, failure_message):
         self.type_mapping = {
             "processing": "Model Inference failure",
@@ -22,6 +24,24 @@ def __str__(self):
 
 
 class PetalsModel(ModelBase):
+    """
+    Petals Model interface.
+
+    Arguments
+    ---------
+    api_url : str
+        URL where the petals server is hosted. If not provided, the implementation will
+        look at environment variable `PETALS_API_URL`
+    timeout : int
+        Number of seconds before the request to the server is timed out
+    temperature : float
+        Temperature value to use for the model. Defaults to zero for reproducibility.
+    top_p : float
+        Top P value to use for the model. Defaults to 0.95
+    max_tokens : int
+        Maximum number of tokens to pass to the model. Defaults to 1512
+    """
+
     def __init__(
         self,
         api_url=None,
@@ -38,7 +58,10 @@ def __init__(
                 "API url must be provided as model config or environment variable (`PETALS_API_URL`)"
             )
         self.api_timeout = timeout
-        self.request_header = {"type": "open_inference_session", "max_length": 1512}
+        self.request_header = {
+            "type": "open_inference_session",
+            "max_length": max_tokens,
+        }
 
         # BLOOM parameters
         tolerance = 1e-7
@@ -56,12 +79,33 @@ def __init__(
         )
 
     def summarize_response(self, response):
+        """Returns the "outputs" key's value, if available"""
         if "outputs" in response:
             return response["outputs"]
 
-        return None
+        return response
 
     def prompt(self, processed_input):
+        """
+        Petals API Implementation
+
+        Arguments
+        ---------
+        processed_input : dictionary
+            Must be a dictionary with one key "prompt", the value of which
+            must be a string.
+
+        Returns
+        -------
+        response : Petals API response
+            Response from the petals server
+
+        Raises
+        ------
+        PetalsFailure : Exception
+            This method raises this exception if the server responded with a non-ok
+            response
+        """
         with connect(self.api_url, close_timeout=self.api_timeout) as websocket:
             websocket.send(json.dumps(self.request_header))
             connect_message = json.loads(websocket.recv())
diff --git a/llmebench/models/model_base.py b/llmebench/models/model_base.py
index 113bb85a..06b6b39a 100644
--- a/llmebench/models/model_base.py
+++ b/llmebench/models/model_base.py
@@ -21,6 +21,39 @@ def log_retry(retry_state):
 
 
 class ModelBase(object):
+    """
+    Base class for models
+
+    Implementations of this class need to define at least two mandatory methods:
+    `prompt()` and `summarize_response()`. Implementations of this class should target
+    a specific model inference API, such as a platform (Azure, OpenAI), custom
+    hosted inference server (Petals, FastChat) or other model-specific APIs.
+
+    Attributes
+    ----------
+    max_tries : int, defaults to 5
+        Defines how many retries are allowed per-sample in case of failure.
+        Failure is defined by `retry_exceptions`.
+    retry_exceptions : tuple
+        Tuple of exceptions on which the framework should retry the request
+        for any given sample. Specific exceptions should be included by the
+        implementing class, such as HTTP Request failures (in case of HTTP-
+        based APIs).
+
+    Methods
+    -------
+    prompt(processed_input):
+        Method that takes inputs from an asset and makes the actual request
+        to the underlying model inference API.
+
+    summarize_response(response):
+        Method that takes a model response and summarizes it into a simpler
+        form
+
+    run_model(processed_input):
+        Wrapper that calls the `prompt` method and captures exceptions
+    """
+
     def __init__(self, max_tries=5, retry_exceptions=(), **kwargs):
         self.max_tries = max_tries
 
@@ -34,14 +67,68 @@ def __init__(self, max_tries=5, retry_exceptions=(), **kwargs):
         )(self.prompt)
 
     @abstractmethod
-    def prompt(self, **kwargs):
+    def prompt(self, processed_input):
+        """
+        Method that implements communication to the underlying model
+
+        Arguments
+        ---------
+        processed_input : dict
+            Input from an asset. The structure of this will be dependent
+            on a specific model implementation, and must be documented by
+            the class implementation itself
+
+        Returns
+        -------
+        response : mixed
+            Response from the underlying model API
+
+        Notes
+        -----
+        Ideally, this method will never be called directly, but through the
+        `run_model` wrapper which takes care of returning the output in a
+        consistent manner and also handles errors/exceptions.
+        """
         pass
 
     @abstractmethod
     def summarize_response(self, response):
+        """
+        Method that summarizes/simplifies a model's response
+
+        Arguments
+        ---------
+        response : mixed
+            Response from `prompt()`
+
+        Returns
+        -------
+        simplified_response : mixed
+            Should ideally be a short string that summarizes the model's response
+            (e.g. only the actual label instead of scores and other metadata). Will
+            be saved in the summary file for quick debugging. If the response is not
+            simplifiable, return the response object as is.
+        """
         pass
 
     def run_model(self, processed_input):
+        """
+        Wrapper that calls the `prompt` method and captures exceptions
+
+        Arguments
+        ---------
+        processed_input : dict
+            Input from an asset. The structure of this will be dependent
+            on a specific model implementation, and must be documented by
+            the class implementation itself
+
+        Returns
+        -------
+        response : dict
+            Returns a dictionary with the key "response" holding the model's
+            response, or "failure_exception" with the error that occurred when
+            using the model
+        """
         try:
             response = self.prompt(processed_input)
             return {"response": response}
diff --git a/llmebench/tasks/task_base.py b/llmebench/tasks/task_base.py
index 6cfafa39..fed38ae5 100644
--- a/llmebench/tasks/task_base.py
+++ b/llmebench/tasks/task_base.py
@@ -5,20 +5,111 @@
 
 
 class TaskBase(ABC):
+    """
+    Base class for tasks
+
+    Implementations of this class need to implement one method, `evaluate()`
+    which takes true and predicted labels, and returns some score over
+    these.
+
+    Attributes
+    ----------
+    dataset : DatasetBase
+        The dataset that is currently being evaluated by the task
+    seed : int
+        Seed for initializing pseudo random generators for reproducible
+        results
+
+    Methods
+    -------
+    get_random_prediction(label_set):
+        Helper method to choose a random classification label
+
+    get_random_continuous_prediction(score_range):
+        Helper method to choose a random regression prediction
+
+    create_random_binary_array(length):
+        Helper method to generate random multi-label binary array
+
+    evaluate(true_labels, predicted_labels):
+        Method to evaluate the predictions and return appropriate scores
+    """
+
     def __init__(self, dataset, seed=2023, **kwargs):
         self.dataset = dataset
 
         random.seed(seed)
+        np.random.seed(seed)
 
     def get_random_prediction(self, label_set):
+        """
+        Helper method to choose a random classification label
+
+        Arguments
+        ---------
+        label_set : set
+            Set of unique labels valid for the task
+
+        Returns
+        -------
+        label : mixed
+            A label chosen at random from the `label_set`
+        """
         return random.choice(list(label_set))
 
     def get_random_continuous_prediction(self, score_range):
+        """
+        Helper method to choose a random regression prediction
+
+        Arguments
+        ---------
+        score_range : tuple
+            Tuple (min_val, max_val) that defines the range from
+            which a random number will be chosen
+
+        Returns
+        -------
+        score : float
+            A number chosen at random between `min_val` and `max_val`
+        """
         return random.uniform(score_range[0], score_range[1])
 
     def create_random_binary_array(self, length):
+        """
+        Helper method to generate random multi-label binary array
+
+        Arguments
+        ---------
+        length : int
+            Length of the generated array
+
+        Returns
+        -------
+        binary_array : numpy.ndarray
+            Array of shape `(length,)` where each element is either 0 or 1
+            at random.
+        """
         return np.random.randint(low=0, high=2, size=(length,))
 
     @abstractmethod
     def evaluate(self, true_labels, predicted_labels):
+        """
+        Method to evaluate the predictions and return appropriate scores
+
+        Arguments
+        ---------
+        true_labels : list
+            List of labels (should match "label" key from the dataset's
+            `get_data_sample()`)
+        predicted_labels : list
+            List of predicted labels (should match "label" key from the
+            dataset's `get_data_sample()` in structure and type)
+
+        Returns
+        -------
+        scores : dict
+            Dictionary of one or more elements, each representing a metric
+            computed from the predictions. Examples are "Accuracy", "F1",
+            "Pearson correlation" etc.
+        """
         pass