Skip to content

Commit e6b86ff

Browse files
committed
script
1 parent 9e7ca7e commit e6b86ff

File tree

2 files changed

+212
-0
lines changed

2 files changed

+212
-0
lines changed

src/redturtle/rsync/scripts/__init__.py

Whitespace-only changes.
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
# -*- coding: utf-8 -*-
2+
# documentazione: ....
3+
from zope.interface import Interface
4+
from Acquisition import aq_base
5+
import logging
6+
from plone import api
7+
import requests
8+
from requests.adapters import HTTPAdapter
9+
from requests.packages.urllib3.util.retry import Retry
10+
from plone.namedfile.file import NamedBlobImage
11+
from z3c.relationfield.relation import RelationValue
12+
from zope.lifecycleevent import ObjectModifiedEvent
13+
from zope.event import notify
14+
from Products.Five.utilities.marker import mark
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout to every request.

    ``requests`` has no session-wide timeout setting; this adapter stores a
    default and injects it at ``send()`` time unless the caller already
    passed an explicit ``timeout``.
    """

    def __init__(self, *args, **kwargs):
        # Pop our extra kwarg before delegating: HTTPAdapter.__init__ does
        # not accept ``timeout``.
        # FIX: default to None (= no timeout) — previously an adapter built
        # without the kwarg never set self.timeout at all, and send() then
        # crashed with AttributeError on the first request.
        self.timeout = kwargs.pop("timeout", None)
        super(TimeoutHTTPAdapter, self).__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Session.send always forwards a timeout key (possibly None); only
        # substitute our default when the caller did not choose one.
        timeout = kwargs.get("timeout")
        if timeout is None:
            kwargs["timeout"] = self.timeout
        return super(TimeoutHTTPAdapter, self).send(request, **kwargs)
31+
32+
33+
# https://dev.to/ssbozy/python-requests-with-retries-4p03
34+
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 501, 502, 503, 504),
    timeout=5.0,
    session=None,
):
    """Return a requests Session with retry and default-timeout behaviour.

    Mounts a TimeoutHTTPAdapter (carrying *timeout*) configured with an
    urllib3 Retry policy on both the http:// and https:// prefixes.
    Reuses *session* when given, otherwise creates a fresh one.

    Based on https://dev.to/ssbozy/python-requests-with-retries-4p03
    """
    session = session or requests.Session()
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = TimeoutHTTPAdapter(max_retries=retry_policy, timeout=timeout)
    for prefix in ('http://', 'https://'):
        session.mount(prefix, adapter)
    return session
54+
55+
# Module-level shared session used by rsync(): 7 retries, 30s default timeout.
http = requests_retry_session(retries=7, timeout=30.0)
56+
57+
58+
class ISynced(Interface):
    """Marker interface applied to locally synced content (see page_creator)."""
60+
61+
62+
def json_extractor(container, response, **kwargs):
    """Default extractor: decode the HTTP response body as JSON."""
    return response.json()
64+
65+
66+
def image_extractor(container, response, **kwargs):
    """Extractor for image responses.

    Returns a dict with ``title`` (``kwargs['name']`` or the last URL
    segment) and ``image`` (a NamedBlobImage built from the response body),
    or None — with an error logged — when the content-type is not image/*.
    """
    content_type = response.headers.get('content-type', '')
    if content_type.startswith('image/'):
        filename = response.url.split('/')[-1]
        return {
            'title': kwargs.get('name', filename),
            'image': NamedBlobImage(data=response.content, filename=filename),
        }
    logger.error('invalid for image_extractor %s (%r)', response.url, content_type)
    return None
74+
75+
76+
def page_creator(container, data, id=None, portal_type='Document', **kwargs):
    """Create a new content object in *container* from extracted *data*.

    Optionally transitions the object to ``kwargs['review_state']`` when its
    current workflow state differs, marks it with ISynced, reindexes the
    ``object_provides`` index, and returns the new object.
    """
    obj = api.content.create(container, type=portal_type, id=id, **data)
    logger.warning('created %s', obj.absolute_url())
    if 'review_state' in kwargs:
        wanted_state = kwargs['review_state']
        # NOTE(review): the original carried a commented-out try/except for
        # api.exc.InvalidParameterError here — the transition may raise when
        # the target state is unreachable.
        if api.content.get_state(obj) != wanted_state:
            api.content.transition(obj, to_state=wanted_state)
    mark(obj, ISynced)
    obj.reindexObject(idxs=['object_provides'])
    return obj
89+
90+
91+
def page_delete(obj):
    """Delete *obj* from the site (with a warning log) and return None."""
    url = obj.absolute_url()
    logger.warning('delete %s', url)
    api.content.delete(obj)
    return None
95+
96+
97+
def page_update(obj, data, **kwargs):
    """Copy changed field values from *data* onto *obj*.

    A field is written only when its incoming value differs from the stored
    one (RelationValue pairs are compared by ``to_id``, since relation
    objects do not compare equal directly). When anything changed, fires
    ObjectModifiedEvent, restores the remote ``modification_date`` if one
    was supplied, and reindexes the ``modified`` index. Returns *obj*.
    """
    modified = []
    for fieldname, incoming in data.items():
        # TODO: check that fieldname is actually part of obj's schema?
        # TODO: what is the correct/general way to set a field value?
        # TODO: look at how z3c.form detects whether changes are effective
        # or nothing was modified at all.
        current = getattr(aq_base(obj), fieldname, None)
        if isinstance(incoming, RelationValue) and isinstance(current, RelationValue):
            differs = incoming.to_id != current.to_id
        else:
            differs = incoming != current
        if differs:
            setattr(obj, fieldname, incoming)
            modified.append(fieldname)
    if modified:
        notify(ObjectModifiedEvent(obj))
        # BBB: reindexObject touches modification_date; restore the original
        # remote value afterwards when one exists.
        # NOTE(review): indentation was lost in the source scrape — the
        # reindex call is assumed to belong to this changed branch; confirm.
        if data.get('modification_date'):
            setattr(obj, 'modification_date', data['modification_date'])
        obj.reindexObject(idxs=['modified'])
    logger.warning('update %s fields:%r', obj.absolute_url(), modified)
    return obj
121+
122+
123+
def obj_getter(container, remoteid):
    """Default lookup: the child of *container* named *remoteid*, or None."""
    return container.get(remoteid)
125+
126+
127+
# BBB: use parameters or adapters?
def rsync(container,
          remoteid,
          remoteurl=None,
          data=None,
          force_update=False,
          extractor=json_extractor,
          getter=obj_getter,
          creator=page_creator,
          updater=page_update,
          deleter=page_delete,
          verbose=False,
          **kwargs):
    """Synchronize one remote page into a local Plone container.

    * container: destination plone container
    * remoteid: pageid (destination pageid, i.e. remote uuid)
    * remoteurl: URL to fetch; alternatively pass the payload via *data*

    Creates the object when no local copy exists, updates it when the
    remote ``modification_date`` is newer (or *force_update* is set), and
    deletes it when the remote answers 401/403/404 or yields no data.
    Returns the local object, or None on fetch errors. Raises Exception
    when neither *remoteurl* nor *data* is given.

    # TODO: use If-Modified-Since where possible
    # TODO: possibly add a hook to derive the local content id
    """
    if not remoteurl and not data:
        raise Exception('remoteurl or data required')
    obj = getter(container, remoteid)
    if remoteurl:
        response = http.get(remoteurl)
    else:
        # caller supplied the payload directly; the extractor must cope
        response = data
    if obj:
        # update or delete
        if not response:
            # delete (a 5XX must NOT trigger deletion...)
            # FIX: status_code is an int — the original compared it against
            # the strings '401'/'403'/'404', which was always False, so a
            # remotely-removed page was never deleted locally.
            if response.status_code in (401, 403, 404):
                return deleter(obj)
            else:
                # TODO: raise an exception on error so the update is
                # retried at the next sync?
                logger.error('unable to fetch %s (%s)', remoteurl, response.status_code)
                return None
        else:
            # TODO: compare remote vs. local modification dates
            data = extractor(container, response, **kwargs)
            if verbose:
                # TODO
                logger.warning('DEBUG: %s', data)
            if data:
                # default: without last-modification metadata nothing
                # gets changed
                update = False
                if 'modification_date' in data:
                    # BBB: both dates must exist and both be DateTime
                    update = (data['modification_date'] > obj.modification_date)
                if update or force_update:
                    return updater(obj, data, **kwargs)
            else:
                # a remote page without metadata is treated as deleted,
                # so the local copy must be deleted as well
                return deleter(obj)
            return obj
    else:
        # create
        if not response:
            logger.error('unable to fetch %s (%s)', remoteurl, response.status_code)
            return None
        else:
            data = extractor(container, response, **kwargs)
            if data:
                obj = creator(container, data, id=remoteid, **kwargs)
            return obj
196+
197+
198+
"""
199+
# ESEMPIO: ALMA2021 vs. Magazine
200+
from unibo.api.rsync import rsync
201+
remoteurl = 'http://magazine.dev.dsaw.unibo.it/archivio/2018/mio-articolo-con-il-nuovo-font'
202+
remoteid = '6fc2a87d4aa64cc7ad6b5bd0838a4c0c' # AKA http://magazine.dev.dsaw.unibo.it/archivio/2018/mio-articolo-con-il-nuovo-font/uuid
203+
204+
def magazine_extractor(container, response, lang):
205+
data = extruct.extract(response.text)
206+
return data
207+
208+
container = api.content.get('/alma2021/it/notizie')
209+
obj_it = rsync(container, remoteid, remoteurl, extractor=magazine_extractor, lang='it')
210+
container = api.content.get('/alma2021/en/news')
211+
obj_en = rsync(container, remoteid, remoteurl, extractor=magazine_extractor, lang='en')
212+
"""

0 commit comments

Comments
 (0)