diff --git a/bask/bayesgpr.py b/bask/bayesgpr.py
index fcf4633..44c6cd5 100644
--- a/bask/bayesgpr.py
+++ b/bask/bayesgpr.py
@@ -166,6 +166,17 @@ def __init__(
 
     @property
     def theta(self):
+        """The current geometric median of the kernel hyperparameter distribution.
+
+        The returned values are located in log space. Use `BayesGPR.kernel_` to
+        obtain the values in their original space.
+
+        Returns
+        -------
+        ndarray
+            Array containing the kernel hyperparameters in log space.
+
+        """
         if self.kernel_ is not None:
             with np.errstate(divide="ignore"):
                 return np.copy(self.kernel_.theta)
@@ -192,6 +203,11 @@ def theta(self, theta):
 
     @contextmanager
     def noise_set_to_zero(self):
+        """Context manager in which the noise of the Gaussian process is set to 0.
+
+        This is useful when you want to predict the epistemic uncertainty of the
+        Gaussian process without the noise.
+        """
         current_theta = self.theta
         try:
             # Now we set the noise to 0, but do NOT recalculate the alphas!:
@@ -228,7 +244,58 @@ def sample(
         add=False,
         **kwargs
     ):
-        """ Sample from the posterior distribution of the hyper-parameters."""
+        """Sample from the posterior distribution of the hyper-parameters.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_points, n_dims), optional (default: None)
+            Points at which the function is evaluated. If None, it will use the saved
+            datapoints.
+        y : ndarray, shape (n_points,), optional (default: None)
+            Value(s) of the function at `X`. If None, it will use the saved values.
+        noise_vector : array-like, optional (default: None)
+            Variance(s) of the function at `X`. If None, no additional noise is applied.
+        n_threads : int, optional (default: 1)
+            Number of threads to use during inference.
+            This is currently not implemented.
+        n_desired_samples : int, optional (default: 100)
+            Number of hyperposterior samples to collect during inference. Must be a
+            multiple of `n_walkers_per_thread`.
+        n_burnin : int, optional (default: 0)
+            Number of iterations to discard before collecting hyperposterior samples.
+            Only needs to be increased if the hyperposterior samples have not reached
+            their typical set yet. Higher values increase the running time.
+        n_thin : int, optional (default: 1)
+            Only collect hyperposterior samples every k-th iteration. This can help
+            reduce the autocorrelation of the collected samples, but reduces the
+            total number of samples.
+        n_walkers_per_thread : int, optional (default: 100)
+            Number of MCMC ensemble walkers to employ during inference.
+        progress : bool, optional (default: False)
+            If True, show a progress bar during inference.
+        priors : list or callable, optional (default: None)
+            Log prior(s) for the kernel hyperparameters. Remember that the kernel
+            hyperparameters are transformed into log space. Thus your priors need to
+            perform the necessary change-of-variables.
+        position : ndarray, shape (n_walkers, n_kernel_dims), optional (default: None)
+            Starting position of the Markov chain. If None, it will use the current
+            position. If this is None as well, it will try to initialize in a small
+            ball.
+        add : bool, optional (default: False)
+            If True, all collected hyperposterior samples will be added to the existing
+            samples in `BayesGPR.chain_`. Otherwise they will be replaced.
+        kwargs : dict
+            Additional keyword arguments for emcee.EnsembleSampler.
+
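+        Examples
+        --------
+        A minimal usage sketch; assumes an already fitted instance ``gp``:
+
+        >>> gp.sample(n_desired_samples=100, n_burnin=10)
+        >>> samples = gp.chain_  # collected hyperposterior samples (log space)
+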
+        """
+
         def log_prob_fn(x, gp=self):
             lp = 0
@@ -326,6 +393,42 @@ def fit(
         position=None,
         **kwargs
     ):
+        """Fit the Gaussian process model to the given training data.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_points, n_dims)
+            Points at which the function is evaluated.
+        y : ndarray, shape (n_points,)
+            Value(s) of the function at `X`.
+        noise_vector : array-like, optional (default: None)
+            Variance(s) of the function at `X`. If None, no additional noise is applied.
+        n_threads : int, optional (default: 1)
+            Number of threads to use during inference.
+            This is currently not implemented.
+        n_desired_samples : int, optional (default: 100)
+            Number of hyperposterior samples to collect during inference. Must be a
+            multiple of `n_walkers_per_thread`.
+        n_burnin : int, optional (default: 0)
+            Number of iterations to discard before collecting hyperposterior samples.
+            Only needs to be increased if the hyperposterior samples have not reached
+            their typical set yet. Higher values increase the running time.
+        n_walkers_per_thread : int, optional (default: 100)
+            Number of MCMC ensemble walkers to employ during inference.
+        progress : bool, optional (default: False)
+            If True, show a progress bar during inference.
+        priors : list or callable, optional (default: None)
+            Log prior(s) for the kernel hyperparameters. Remember that the kernel
+            hyperparameters are transformed into log space. Thus your priors need to
+            perform the necessary change-of-variables.
+        position : ndarray, shape (n_walkers, n_kernel_dims), optional (default: None)
+            Starting position of the Markov chain. If None, it will use the current
+            position. If this is None as well, it will try to initialize in a small
+            ball.
+        kwargs : dict
+            Additional keyword arguments for `BayesGPR.sample`.
+
+        """
         self.kernel = self._kernel
         self._apply_noise_vector(len(y), noise_vector)
         super().fit(X, y)
@@ -343,6 +446,36 @@
     )
 
     def sample_y(self, X, sample_mean=False, noise=False, n_samples=1, random_state=0):
+        """Sample function realizations of the Gaussian process.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_points, n_dims)
+            Points at which to evaluate the functions.
+        sample_mean : bool, optional (default: False)
+            If True, the geometric median of the hyperposterior samples is used as the
+            Gaussian process to sample from. If False, a new set of hyperposterior
+            samples is used for each new sample.
+        noise : bool, optional (default: False)
+            If True, Gaussian noise is added to the samples.
+        n_samples : int, optional (default: 1)
+            Number of samples to draw from the Gaussian process(es).
+        random_state : int, RandomState instance or None, optional (default: 0)
+            Pseudo random number generator state used to draw the samples.
+
+        Returns
+        -------
+        result : ndarray, shape (n_points, n_samples)
+            Samples from the Gaussian process(es).
+
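+        Examples
+        --------
+        A minimal usage sketch; assumes a fitted instance ``gp`` and inputs ``X``:
+
+        >>> realizations = gp.sample_y(X, n_samples=10)
+        >>> realizations.shape  # (n_points, 10)
+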
+        """
         rng = check_random_state(random_state)
         if sample_mean:
             if noise:
diff --git a/bask/optimizer.py b/bask/optimizer.py
index 3569e37..ea5befe 100644
--- a/bask/optimizer.py
+++ b/bask/optimizer.py
@@ -168,6 +168,30 @@ def __init__(
         self._next_x = None
     def ask(self, n_points=1):
+        """Ask the optimizer for the next point to evaluate.
+
+        If the optimizer is still in its initialization phase, it will return a point
+        as specified by the init_strategy.
+        If the Gaussian process has been fit, a previously computed point, which
+        optimizes the acquisition function, will be returned.
+
+        Parameters
+        ----------
+        n_points : int
+            Number of points to return. This is currently not implemented and will
+            raise a NotImplementedError.
+
+        Returns
+        -------
+        list
+            A list with the same dimensionality as the optimization space.
+
+        Raises
+        ------
+        NotImplementedError
+            If `n_points` is != 1, which is not implemented yet.
+
+        """
         if n_points > 1:
             raise NotImplementedError(
                 "Returning multiple points is not implemented yet."
             )
@@ -197,12 +221,63 @@ def tell(
         self,
         x,
         y,
         noise_vector=None,
         fit=True,
         replace=False,
         n_samples=0,
         gp_samples=100,
         gp_burnin=10,
         progress=False,
     ):
-        # if y isn't a scalar it means we have been handed a batch of points
+        """Inform the optimizer about the objective function at discrete points.
 
-        # TODO (noise vector):
-        # 1. Replace case should be easy
-        # 2. Add case should add noise values to list
-        # -> What if noise_vector is None? (have to set noise to 0)
+
+        Provide values of the objective function at points suggested by `ask()` or
+        other points. By default, a new model will be fit to all observations.
+        The new model is used to suggest the next point at which to evaluate the
+        objective. This point can be retrieved by calling `ask()`.
+        To add observations without fitting a new model, set `fit` to False.
+        To add multiple observations in a batch, pass a list-of-lists for `x`
+        and a list of scalars for `y`.
+
+        Parameters
+        ----------
+        x : list or list of lists
+            Point(s) at which the objective function was evaluated.
+        y : scalar or list
+            Value(s) of the objective function at `x`.
+        noise_vector : list, optional (default: None)
+            Variance(s) of the objective function at `x`.
+        fit : bool, optional (default: True)
+            If True, a model will be fitted to the points, if `n_initial_points`
+            points have been evaluated.
+        replace : bool, optional (default: False)
+            If True, the existing data points will be replaced with the ones given
+            in `x` and `y`.
+        n_samples : int, optional (default: 0)
+            Number of hyperposterior samples over which to average the acquisition
+            function. More samples make the acquisition function more robust, but
+            increase the running time.
+            Can be set to 0 for `pvrs` and `vr`.
+        gp_samples : int, optional (default: 100)
+            Number of hyperposterior samples to collect during inference. More samples
+            result in a more accurate representation of the hyperposterior, but
+            increase the running time.
+            Has to be a multiple of 100.
+        gp_burnin : int, optional (default: 10)
+            Number of inference iterations to discard before beginning to collect
+            hyperposterior samples. Only needs to be increased if the hyperposterior
+            after burnin has not settled on the typical set. Higher values drastically
+            increase the running time.
+        progress : bool, optional (default: False)
+            If True, show a progress bar during the inference phase.
+
+        Returns
+        -------
+        scipy.optimize.OptimizeResult object
+            Contains the points, the values of the objective function, the search
+            space, the random state and the list of models.
+
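+        Examples
+        --------
+        A minimal sketch of the ask/tell-loop; assumes an ``Optimizer`` instance
+        ``opt`` and an objective function ``objective``:
+
+        >>> x = opt.ask()
+        >>> result = opt.tell(x, objective(x), n_samples=5, gp_burnin=10)
+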
+        """
         if replace:
             self.Xi = []
             self.yi = []
@@ -288,6 +363,30 @@
         return create_result(self.Xi, self.yi, self.space, self.rng, models=[self.gp])
 
     def run(self, func, n_iter=1, n_samples=5, gp_burnin=10):
+        """Execute the ask/tell-loop on a given objective function.
+
+        Parameters
+        ----------
+        func : function
+            The objective function to minimize.
+        n_iter : int, optional (default: 1)
+            Number of iterations to perform.
+        n_samples : int, optional (default: 5)
+            Number of hyperposterior samples over which to average the acquisition
+            function.
+        gp_burnin : int, optional (default: 10)
+            Number of inference iterations to discard before beginning to collect
+            hyperposterior samples. Only needs to be increased if the hyperposterior
+            after burnin has not settled on the typical set. Higher values drastically
+            increase the running time.
+
+        Returns
+        -------
+        scipy.optimize.OptimizeResult object
+            Contains the points, the values of the objective function, the search
+            space, the random state and the list of models.
+
+        """
         for _ in range(n_iter):
             x = self.ask()
             self.tell(x, func(x), n_samples=n_samples, gp_burnin=gp_burnin)