elpaco-escience · bvreede · Dec 1, 2023 · Mar 31, 2023 · Apr 7, 2023 · Apr 7, 2023
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
@@ -2,6 +2,8 @@
 
 ### To enable this githook, run:
 ### git config --local core.hooksPath .githooks
+### To disable this githook, run:
+### git config --unset core.hooksPath
 
 echo "Script $0 triggered ..."
 

diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb
@@ -63,7 +63,7 @@
     {
      "data": {
       "text/plain": [
-       "<sktalk.corpus.conversation.Conversation at 0x10ea2bd60>"
+       "<sktalk.corpus.conversation.Conversation at 0x116bc4af0>"
       ]
      },
      "execution_count": 2,
@@ -92,16 +92,16 @@
     {
      "data": {
       "text/plain": [
-       "[Utterance(utterance='0', participant='S', time=(0, 1500), begin='00:00:00.000', end='00:00:01.500', metadata=None),\n",
-       " Utterance(utterance=\"mm I'm glad I saw you⇗\", participant='S', time=(1500, 2775), begin='00:00:01.500', end='00:00:02.775', metadata=None),\n",
-       " Utterance(utterance=\"I thought I'd lost you (0.3)\", participant='S', time=(2775, 3773), begin='00:00:02.775', end='00:00:03.773', metadata=None),\n",
-       " Utterance(utterance=\"⌈no I've been here for a whi:le⌉,\", participant='H', time=(4052, 5515), begin='00:00:04.052', end='00:00:05.515', metadata=None),\n",
-       " Utterance(utterance='⌊xxx⌋ (0.3)', participant='S', time=(4052, 5817), begin='00:00:04.052', end='00:00:05.817', metadata=None),\n",
-       " Utterance(utterance=\"⌊hm:: (.) if ʔI couldn't boʔrrow, (1.3) the second (0.2) book of readings fo:r\", participant='S', time=(6140, 9487), begin='00:00:06.140', end='00:00:09.487', metadata=None),\n",
-       " Utterance(utterance='commu:nicating acro-', participant='H', time=(12888, 14050), begin='00:00:12.888', end='00:00:14.050', metadata=None),\n",
-       " Utterance(utterance='no: for family gender and sexuality', participant='H', time=(14050, 17014), begin='00:00:14.050', end='00:00:17.014', metadata=None),\n",
-       " Utterance(utterance=\"+≋ ah: that's the second on is itʔ\", participant='S', time=(17014, 18611), begin='00:00:17.014', end='00:00:18.611', metadata=None),\n",
-       " Utterance(utterance=\"+≋ I think it's s⌈ame family gender⌉ has a second book\", participant='H', time=(18611, 21090), begin='00:00:18.611', end='00:00:21.090', metadata=None)]"
+       "[Utterance(utterance='0', participant='S', time=[0, 1500], begin='00:00:00.000', end='00:00:01.500', metadata=None, utterance_clean='S x150_1500x15', utterance_list=['S', 'x150_1500x15'], n_words=2, n_characters=13, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance=\"mm I'm glad I saw you⇗\", participant='S', time=[1500, 2775], begin='00:00:01.500', end='00:00:02.775', metadata=None, utterance_clean='S mm Im glad I saw you x151500_2775x15', utterance_list=['S', 'mm', 'Im', 'glad', 'I', 'saw', 'you', 'x151500_2775x15'], n_words=8, n_characters=31, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance=\"I thought I'd lost you (0.3)\", participant='S', time=[2775, 3773], begin='00:00:02.775', end='00:00:03.773', metadata=None, utterance_clean='S I thought Id lost you x152775_3773x15 x153773_4052x15', utterance_list=['S', 'I', 'thought', 'Id', 'lost', 'you', 'x152775_3773x15', 'x153773_4052x15'], n_words=8, n_characters=48, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance=\"⌈no I've been here for a whi:le⌉,\", participant='H', time=[4052, 5515], begin='00:00:04.052', end='00:00:05.515', metadata=None, utterance_clean='H no Ive been here for a while x154052_5515x15', utterance_list=['H', 'no', 'Ive', 'been', 'here', 'for', 'a', 'while', 'x154052_5515x15'], n_words=9, n_characters=38, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance='⌊xxx⌋ (0.3)', participant='S', time=[4052, 5817], begin='00:00:04.052', end='00:00:05.817', metadata=None, utterance_clean='S xxx x154052_5817x15 x155817_6140x15', utterance_list=['S', 'xxx', 'x154052_5817x15', 'x155817_6140x15'], n_words=4, n_characters=34, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance=\"⌊hm:: (.) if ʔI couldn't boʔrrow, (1.3) the second (0.2) book of readings fo:r\", participant='S', time=[6140, 9487], begin='00:00:06.140', end='00:00:09.487', metadata=None, utterance_clean='S hm  if ʔI couldnt boʔrrow x156140_9487x15 the second book of readings for x159487_12888x15', utterance_list=['S', 'hm', 'if', 'ʔI', 'couldnt', 'boʔrrow', 'x156140_9487x15', 'the', 'second', 'book', 'of', 'readings', 'for', 'x159487_12888x15'], n_words=14, n_characters=78, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance='commu:nicating acro-', participant='H', time=[12888, 14050], begin='00:00:12.888', end='00:00:14.050', metadata=None, utterance_clean='H communicating acro x1512888_14050x15', utterance_list=['H', 'communicating', 'acro', 'x1512888_14050x15'], n_words=4, n_characters=35, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance='no: for family gender and sexuality', participant='H', time=[14050, 17014], begin='00:00:14.050', end='00:00:17.014', metadata=None, utterance_clean='H no for family gender and sexuality x1514050_17014x15', utterance_list=['H', 'no', 'for', 'family', 'gender', 'and', 'sexuality', 'x1514050_17014x15'], n_words=8, n_characters=47, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance=\"+≋ ah: that's the second on is itʔ\", participant='S', time=[17014, 18611], begin='00:00:17.014', end='00:00:18.611', metadata=None, utterance_clean='S  ah thats the second on is itʔ x1517014_18611x15', utterance_list=['S', 'ah', 'thats', 'the', 'second', 'on', 'is', 'itʔ', 'x1517014_18611x15'], n_words=9, n_characters=41, time_to_next=None, dyadic=None, FTO=None),\n",
+       " Utterance(utterance=\"+≋ I think it's s⌈ame family gender⌉ has a second book\", participant='H', time=[18611, 21090], begin='00:00:18.611', end='00:00:21.090', metadata=None, utterance_clean='H  I think its same family gender has a second book x1518611_21090x15', utterance_list=['H', 'I', 'think', 'its', 'same', 'family', 'gender', 'has', 'a', 'second', 'book', 'x1518611_21090x15'], n_words=12, n_characters=57, time_to_next=None, dyadic=None, FTO=None)]"
       ]
      },
      "execution_count": 3,
@@ -225,7 +225,7 @@
     {
      "data": {
       "text/plain": [
-       "[<sktalk.corpus.conversation.Conversation at 0x10ea2bd60>]"
+       "[<sktalk.corpus.conversation.Conversation at 0x116bc4af0>]"
       ]
      },
      "execution_count": 7,
@@ -256,6 +256,92 @@
    "source": [
     "GCSAusE.write_json(path = \"CGSAusE.json\")\n"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analyzing turn-taking dynamics\n",
+    "\n",
+    "When creating a `Conversation` object, a number of calculations and transformations are performed on the `Utterance` objects within.\n",
+    "For example, the number of words in each utterance is calculated, and stored under `Utterance.n_words`.\n",
+    "You can see this for a specific utterance as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cha01.utterances[0].n_words"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "More sophisticated calculations can be performed, but do not happen automatically.\n",
+    "An example of this is the calculation of the Floor Transfer Offset (FTO) per utterance.\n",
+    "FTO is defined as the difference between the time that a turn starts, and the end of the most relevant prior turn by the other participant.\n",
+    "If there is overlap between these turns, the FTO is negative.\n",
+    "If there is a pause between these utterances, the FTO is positive.\n",
+    "\n",
+    "We can calculate the FTOs of the utterances in a conversation:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0, 1500] S - FTO: None\n",
+      "[1500, 2775] S - FTO: None\n",
+      "[2775, 3773] S - FTO: None\n",
+      "[4052, 5515] H - FTO: 279\n",
+      "[4052, 5817] S - FTO: None\n",
+      "[6140, 9487] S - FTO: 625\n",
+      "[12888, 14050] H - FTO: 3401\n",
+      "[14050, 17014] H - FTO: 4563\n",
+      "[17014, 18611] S - FTO: 0\n",
+      "[18611, 21090] H - FTO: 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "cha01.calculate_FTO()\n",
+    "\n",
+    "for utterance in cha01.utterances[:10]:\n",
+    "    print(f'{utterance.time} {utterance.participant} - FTO: {utterance.FTO}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To determine which prior turn is the relevant turn for FTO calculation, the following criteria are used to find a relevant utterance prior to an utterance U:\n",
+    "\n",
+    "- the relevant utterance must be by another participant\n",
+    "- the relevant utterance must be the most recent utterance by that participant\n",
+    "- the relevant utterance must have started more than a specified number of ms before the start of U. This time defaults to 200 ms, but can be changed with the `planning_buffer` argument.\n",
+    "- the relevant utterance must be partly or entirely within the context window. The context window is defined as 10s (or 10000ms) prior to the utterance U. The size of this window can be changed with the `window` argument.\n",
+    "- within the context window, there must be at most 2 speakers; this maximum can be changed with the `n_participants` argument."
+   ]
   }
  ],
  "metadata": {

diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py
@@ -1,11 +1,12 @@
 import warnings
+from typing import Optional
 from .utterance import Utterance
 from .write.writer import Writer
 
 
 class Conversation(Writer):
     def __init__(
-        self, utterances: list["Utterance"], metadata: dict = None  # noqa: F821
+        self, utterances: list["Utterance"], metadata: Optional[dict] = None, suppress_warnings: bool = False  # noqa: F821
     ) -> None:
         """Representation of a transcribed conversation
 
@@ -26,7 +27,7 @@ def __init__(
             if not isinstance(utterance, Utterance):
                 raise TypeError(errormsg)
         # The list can be empty. This would be weird and the user needs to be warned.
-        if not self._utterances:
+        if not self._utterances and not suppress_warnings:
             warnings.warn(
                 "This conversation appears to be empty: no Utterances are read.")
 
@@ -68,3 +69,169 @@ def asdict(self):
             dict: dictionary containing Conversation metadata and Utterances
         """
         return self._metadata | {"Utterances": [u.asdict() for u in self._utterances]}
+
+    def _subconversation_by_index(self,
+                                  index: int,
+                                  before: int = 0,
+                                  after: Optional[int] = None) -> "Conversation":
+        """Select utterances to provide context as a sub-conversation
+
+        Args:
+            index (int): The index of the utterance for which to provide context
+            before (int, optional): The number of utterances prior to indicated utterance. Defaults to 0.
+            after (int, optional): The number of utterances after the indicated utterance. Defaults to None,
+                which then assumes the same value as `before`.
+
+        Raises:
+            IndexError: Index provided must be within range of utterances
+
+        Returns:
+            Conversation: Conversation object without metadata, containing a reduced set of utterances
+        """
+        if index < 0 or index >= len(self._utterances):
+            raise IndexError("Index out of range")
+        if after is None:
+            after = before
+        if index - before < 0:
+            before = index
+        if index + after + 1 > len(self._utterances):
+            after = len(self._utterances) - index - 1
+        returned_utterances = self._utterances[index-before:index+after+1]
+        return Conversation(utterances=returned_utterances, suppress_warnings=True)
+
+    def _subconversation_by_time(self,
+                                 index: int,
+                                 before: int = 0,
+                                 after: Optional[int] = None,
+                                 exclude_utterance_overlap: bool = False) -> "Conversation":
+        """Select utterances to provide context as a sub-conversation
+
+        Args:
+            index (int): The index of the utterance for which to provide context
+            before (int, optional): The time in ms preceding the utterance's begin. Defaults to 0.
+            after (int, optional): The time in ms following the utterance's end. Defaults to None,
+                which then assumes the same value as `before`.
+            exclude_utterance_overlap (bool, optional): If True, the duration of the
+                utterance itself is not used to identify overlapping utterances, and only
+                the window before or after the utterance is used. Defaults to False.
+
+        Returns:
+            Conversation: Conversation object without metadata, containing a reduced set of utterances
+        """
+        if index < 0 or index >= len(self._utterances):
+            raise IndexError("Index out of range")
+        if after is None:
+            after = before
+        try:
+            begin = self._utterances[index].time[0] - before
+            end = self._utterances[index].time[1] + after
+            if exclude_utterance_overlap and before == 0:  # only overlap with window following utterance
+                begin = self._utterances[index].time[1]
+            elif exclude_utterance_overlap and after == 0:  # only overlap with window preceding utterance
+                end = self._utterances[index].time[0]
+            returned_utterances = [
+                u for u in self._utterances if self.overlap(begin, end, u.time) or u == self._utterances[index]]
+        except (TypeError, IndexError):
+            returned_utterances = []
+        return Conversation(utterances=returned_utterances, suppress_warnings=True)
+
+    def count_participants(self, except_none: bool = False) -> int:
+        """Count the number of participants in a conversation
+
+        Importantly: if one of the utterances has no participant, it is counted
+        as a separate participant (None). If you want to exclude these, set
+        `except_none` to True.
+
+        Args:
+            except_none (bool, optional): if `True`, utterances without a participant are not counted. Defaults to `False`.
+
+        Returns:
+            int: number of participants
+        """
+        participants = [u.participant for u in self.utterances]
+        if except_none:
+            participants = [p for p in participants if p is not None]
-        participants = [u.participant for u in self.utterances]
-        if except_none:
-            participants = [p for p in participants if p is not None]
+        participants = [u.participant for u in self._utterances if not except_none or u.participant is not None]
+
-        participants = [u.participant for u in self.utterances]
-        if except_none:
-            participants = [p for p in participants if p is not None]
+        participants = [u.participant for u in self._utterances if not except_none or u.participant is not None]
+
+        return len(set(participants))
+
+    def _update(self, field: str, values: list, **kwargs):
+        """
+        Update all utterances in the conversation with calculated values
+
+        This function also stores relevant arguments in the Conversation metadata.
+
+        Args:
+            field (str): field of the Utterance to update
+            values (list): list of values to update each utterance with
+            kwargs (dict): information about the calculation to store in the Conversation metadata
+        """
+        if len(values) != len(self.utterances):
-        if len(values) != len(self.utterances):
+        if len(values) != len(self._utterances):
-        if len(values) != len(self.utterances):
+        if len(values) != len(self._utterances):
+            raise ValueError(
+                "The number of values must match the number of utterances")
+        metadata = {field: kwargs}
+        try:
+            self._metadata["Calculations"].update(metadata)
+        except KeyError:
+            self._metadata = {"Calculations": metadata}
-        try:
-            self._metadata["Calculations"].update(metadata)
-        except KeyError:
-            self._metadata = {"Calculations": metadata}
+        self._metadata.setdefault("Calculations", {}).update(metadata)
-        try:
-            self._metadata["Calculations"].update(metadata)
-        except KeyError:
-            self._metadata = {"Calculations": metadata}
+        self._metadata.setdefault("Calculations", {}).update(metadata)
+        for index, utterance in enumerate(self.utterances):
-        for index, utterance in enumerate(self.utterances):
+        for index, utterance in enumerate(self._utterances):
-        for index, utterance in enumerate(self.utterances):
+        for index, utterance in enumerate(self._utterances):
+            setattr(utterance, field, values[index])
+
+    def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_participants: int = 2):
+        """Calculate Floor Transfer Offset (FTO) per utterance
+
+        FTO is defined as the difference between the time that a turn starts and the
+        end of the most relevant prior turn by the other participant, which is not
+        necessarily the prior utterance.
+
+        An utterance does not receive an FTO if there are preceding utterances
+        within the window that do not have timing information, or if it lacks
+        timing information itself.
+
+        To be a relevant prior turn, the following conditions must be met, respective to utterance U:
+        - the utterance must be by another speaker than U
+        - the utterance by the other speaker must be the most recent utterance by that speaker
+        - the utterance must have started more than `planning_buffer` ms before the start of utterance U
+        - the utterance must be partly or entirely within the context window (`window` ms prior
+            to the start of utterance U)
+        - within the context window, there must be a maximum of `n_participants` speakers.
+
+        Args:
+            window (int, optional): the time in ms prior to utterance in which a
+                relevant preceding utterance can be found. Defaults to 10000.
+            planning_buffer (int, optional): minimum speaking time in ms to allow for a response.
+                Defaults to 200.
+            n_participants (int, optional): maximum number of participants overlapping with
+                the utterance and preceding window. Defaults to 2.
+        """
+        values = []
+        for index, utterance in enumerate(self.utterances):
-        for index, utterance in enumerate(self.utterances):
+        for index, utterance in enumerate(self._utterances):
-        for index, utterance in enumerate(self.utterances):
+        for index, utterance in enumerate(self._utterances):
+            sub = self._subconversation_by_time(
+                index=index,
+                before=window,
+                after=0,
+                exclude_utterance_overlap=True)
+            if not 2 <= sub.count_participants() <= n_participants:
+                values.append(None)
+                continue
+            potentials = [
+                u for u in sub.utterances if utterance.relevant_for_fto(u, planning_buffer)]
+            try:
+                relevant = potentials[-1]
+                values.append(relevant.until(utterance))
+            except IndexError:
+                values.append(None)
+        self._update("FTO", values,
+                     window=window,
+                     planning_buffer=planning_buffer,
+                     n_participants=n_participants)
+
+    @staticmethod
+    def overlap(begin: int, end: int, time: list):
+        # there is overlap if:
+        # time[0] falls between begin and end
+        # time[1] falls between begin and end
+        # time[0] is before begin and time[1] is after end
+        if time is None:
+            return False
+        if begin <= time[0] <= end or begin <= time[1] <= end:
+            return True
+        return time[0] <= begin and time[1] >= end
diff --git a/sktalk/corpus/parsing/cha.py b/sktalk/corpus/parsing/cha.py
@@ -29,24 +29,16 @@ def _to_utterance(chat_utterance) -> Utterance:
             time=chat_utterance.time_marks,
             utterance=str(chat_utterance.tiers),
         )
-        utterance.begin, utterance.end = ChaFile._split_time(utterance.time)
         utterance.utterance = ChaFile._clean_utterance(utterance.utterance)
+        try:
+            utterance.time = list(utterance.time)
+        except TypeError:
+            utterance.time = None
         return utterance
 
     def _extract_metadata(self):
         return self._pla_reader().headers()[0]
 
-    @staticmethod
-    def _split_time(time):
-        if time is None:
-            return None, None
-        begin, end = str(time).split(", ")
-        begin = begin.replace("(", "")
-        end = end.replace(")", "")
-        begin = InputFile._to_timestamp(begin)
-        end = InputFile._to_timestamp(end)
-        return (begin, end)
-
     @staticmethod
     def _clean_utterance(utterance):
         utterance = str(utterance)