Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More flexible handling of encoding error #10

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion deltas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .apply import apply
from .apply_get_a import apply_get_a
from .apply_get_b import apply_get_b
from .operations import Operation, Insert, Delete, Equal
from .algorithms.diff_engine import DiffEngine
from .algorithms import segment_matcher, SegmentMatcher
Expand All @@ -11,7 +13,7 @@
from .about import (__name__, __version__, __author__, __author_email__,
__description__, __license__, __url__)

__all__ = [apply,
__all__ = [apply, apply_get_a, apply_get_b,
Operation, Insert, Delete, Equal,
DiffEngine,
segment_matcher, SegmentMatcher,
Expand Down
20 changes: 20 additions & 0 deletions deltas/apply_get_a.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
def apply_get_a(operations_diff_file, separator=' '):
    """
    Rebuild the "a" side of a diff from serialized operations.

    Each operation is a mapping with a ``"name"`` (``"equal"``,
    ``"delete"`` or ``"insert"``), the integer bounds ``"a1"`` and
    ``"a2"``, and optionally a ``"tokens"`` list.  The tokens of every
    "equal" and "delete" operation are written into positions
    ``a1:a2`` of the result; "insert" operations do not contribute to
    the "a" side and are ignored.  Positions covered by no tokens stay
    as empty strings.

    :Parameters:
        operations_diff_file : iterable of `dict`
            The serialized operations.  May be empty and may be a
            one-shot iterator.
        separator : `str`
            String placed between consecutive tokens in the result.
            Defaults to a single space.

    :Returns:
        A `str` made of the reconstructed tokens joined by `separator`
        (an empty string when there are no operations).

    :Raises:
        TypeError : if an operation has an unknown ``"name"``.
    """
    # The operations are scanned twice (sizing, then filling); materialize
    # them so a generator is not silently exhausted by the first pass.
    operations = list(operations_diff_file)

    # default=0 keeps an empty diff from raising ValueError in max().
    length_a = max((operation["a2"] for operation in operations), default=0)
    a = [''] * length_a

    for operation in operations:
        name = operation["name"]

        if name in ("equal", "delete"):
            if "tokens" in operation:
                # NOTE: assumes len(tokens) == a2 - a1; a slice assignment
                # of a different length would resize the list.
                a[operation["a1"]:operation["a2"]] = operation["tokens"]

        elif name == "insert":
            # Inserted tokens only exist on the "b" side.
            continue

        else:
            raise TypeError("Unexpected operation type " +
                            "{0!r}".format(name))

    return separator.join(a)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why join everything with a space?

21 changes: 21 additions & 0 deletions deltas/apply_get_b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
def apply_get_b(operations_diff_file, separator=' '):
    """
    Rebuild the "b" side of a diff from serialized operations.

    Each operation is a mapping with a ``"name"`` (``"equal"``,
    ``"insert"`` or ``"delete"``), the integer bounds ``"b1"`` and
    ``"b2"``, and optionally a ``"tokens"`` list.  The tokens of every
    "equal" and "insert" operation are written into positions
    ``b1:b2`` of the result; "delete" operations do not contribute to
    the "b" side and are ignored.  Positions covered by no tokens stay
    as empty strings.

    :Parameters:
        operations_diff_file : iterable of `dict`
            The serialized operations.  May be empty and may be a
            one-shot iterator.
        separator : `str`
            String placed between consecutive tokens in the result.
            Defaults to a single space.

    :Returns:
        A `str` made of the reconstructed tokens joined by `separator`
        (an empty string when there are no operations).

    :Raises:
        TypeError : if an operation has an unknown ``"name"``.
    """
    # The operations are scanned twice (sizing, then filling); materialize
    # them so a generator is not silently exhausted by the first pass.
    operations = list(operations_diff_file)

    # default=0 keeps an empty diff from raising ValueError in max().
    length_b = max((operation["b2"] for operation in operations), default=0)
    b = [''] * length_b

    for operation in operations:
        name = operation["name"]

        if name in ("equal", "insert"):
            if "tokens" in operation:
                # NOTE: assumes len(tokens) == b2 - b1; a slice assignment
                # of a different length would resize the list.
                b[operation["b1"]:operation["b2"]] = operation["tokens"]

        elif name == "delete":
            # Deleted tokens only exist on the "a" side.
            continue

        else:
            raise TypeError("Unexpected operation type " +
                            "{0!r}".format(name))

    return separator.join(b)
2 changes: 1 addition & 1 deletion deltas/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def __new__(cls, a1, a2, b1, b2, name=None):
return Operation.__new__(cls, "equal", a1, a2, b1, b2)

def relevant_tokens(self, a, b):
return a[self.a1:self.a2]
return b[self.b1:self.b2]


def print_operations(operations, a, b):
Expand Down
4 changes: 2 additions & 2 deletions deltas/segmenters/segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class MatchableSegment(Segment):

def initialize(self, *args, **kwargs):
super().initialize(*args, **kwargs)
self.sha1 = hashlib.sha1(bytes(str(self), 'utf-8'))
self.sha1 = hashlib.sha1(bytes(str(self), 'utf-8', errors = "replace"))
self.match = None

def __eq__(self, other):
Expand All @@ -117,7 +117,7 @@ def __setstate__(self, args): self.initialize(*args)

def append(self, subsegment):
super().append(subsegment)
self.sha1.update(bytes(str(subsegment), 'utf-8'))
self.sha1.update(bytes(str(subsegment), 'utf-8', errors = "replace"))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 this is a good idea.


def extend(self, subsegments):
for subsegment in subsegments:
Expand Down
17 changes: 17 additions & 0 deletions notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Faire l'exemple suivant :

- un chien est un sale animal
- Iris est un sale animal
- Iris est un sale chat

# incompréhension

- fonctionnement de la fonction apply : pourquoi yield ?

# Différences dans les operations

- dans les diff, il y a les tokens qui permettent de reconstruire le fichier, pas dans le apply : **voilà pourquoi le package original ne permet pas cette fonction**

# Manuel d'utilisation

- s'assurer que l'on a bien le même tokenizer de l'un à l'autre
33 changes: 33 additions & 0 deletions test_apply_get_a.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from deltas import segment_matcher, text_split
from deltas import apply_get_a, apply_get_b
from deltas import Operation, Insert, Delete, Equal

import deltas
from pprint import pprint

# Demo: diff two token sequences, serialize the operations to plain dicts
# (including their tokens), then rebuild both texts from the dicts alone.
a = text_split.tokenize("This is some text. This is some other text.")
b = text_split.tokenize("This is some other text. This is some text.")
operations = segment_matcher.diff(a, b)

# One dict per operation, carrying the operation's bounds and the string
# form of its relevant tokens.
operations_format = [
    {
        'name': operation.name,
        'a1': operation.a1,
        'b1': operation.b1,
        'a2': operation.a2,
        'b2': operation.b2,
        'tokens': [str(token)
                   for token in operation.relevant_tokens(a, b)],
    }
    for operation in operations
]

print(apply_get_a(operations_format))
print(apply_get_b(operations_format))

# Turn the dicts back into Operation objects so they can be printed next
# to the originals.
operations_new = [
    Operation(name=entry["name"],
              a1=entry["a1"], a2=entry["a2"],
              b1=entry["b1"], b2=entry["b2"])
    for entry in operations_format
]

print(operations)
print(operations_new)