*RでTwitterからデータをとってくる
**参考　http://cran.r-project.org/web/packages/twitteR/vignettes/twitteR.pdf
**http://www.atmarkit.co.jp/fcoding/articles/stat/05/stat05a.html  その後Twitterの認証方法が変更になったため、使えない部分が多いので参考までに。

*まずインストール  　　　一度すればok
 install.packages('XML')


*処理方法
 library(XML)
 #' Search Twitter through the legacy Atom search endpoint and collect the
 #' hits into a data frame.
 #'
 #' NOTE(review): the http://search.twitter.com/search.atom endpoint predates
 #' Twitter's authenticated API and is presumably no longer served - confirm
 #' before relying on this function.
 #'
 #' @param sword  Search string, e.g. "masason RT".
 #' @param npages Number of result pages to request; each request asks for
 #'   100 results (`rpp=100`).
 #' @return A list of two elements: [[1]] a data frame with the columns
 #'   pdate, user_name, user_rname, user_url and messages (zero rows when
 #'   nothing was found); [[2]] the parsed XML documents, nested as
 #'   list(previous, document) starting from character(0).
 searchTweet2 <- function(sword, npages) {
   atom_ns <- c(s = "http://www.w3.org/2005/Atom")
   # Use the OS family only; pattern-matching all Sys.info() fields would
   # also match host or user names that happen to contain "wind".
   fg.win <- .Platform$OS.type == "windows"
   # Percent-encode the UTF-8 bytes of the query and escape reserved
   # characters too, so "#hashtag" or "a&b" stay inside the q= parameter.
   twitter_q <- URLencode(enc2utf8(sword), reserved = TRUE)
   mydata.xmls <- character(0)
   page_data <- list()
   # seq_len() requests nothing when npages is 0 (1:0 would run twice).
   for (page in seq_len(npages)) {
     twitter_url <- paste0(
       "http://search.twitter.com/search.atom?q=", twitter_q,
       "&locale=ja&lang=ja&rpp=100&page=", page
     )
     mydata.xml <- xmlParseDoc(twitter_url, asText = FALSE, encoding = "UTF-8")
     mydata.xmls <- list(mydata.xmls, mydata.xml)
     pdate <- xpathSApply(mydata.xml, "//s:entry/s:published", xmlValue,
                          namespaces = atom_ns)
     if (length(pdate) == 0) {
       # Report and skip the page; building a data frame from empty columns
       # would fail when the column names are assigned.
       print(list(sword, "ヒットしませんでした"))
       next
     }
     # The text of an <author> node is "<name>\n<uri>" surrounded by
     # indentation. After removing spaces and splitting on line breaks the
     # last non-empty piece is the URL; everything before it is the name,
     # which may itself contain line breaks and is therefore re-joined.
     authors <- xpathSApply(mydata.xml, "//s:entry/s:author", xmlValue,
                            namespaces = atom_ns)
     author_parts <- lapply(
       strsplit(gsub(" ", "", authors, fixed = TRUE), "\n", fixed = TRUE),
       function(p) p[nzchar(p)]
     )
     user_url <- vapply(
       author_parts,
       function(p) if (length(p) > 0) p[length(p)] else NA_character_,
       character(1), USE.NAMES = FALSE
     )
     usernam <- vapply(
       author_parts,
       function(p) paste(p[-length(p)], collapse = ""),
       character(1), USE.NAMES = FALSE
     )
     # usernam looks like "explorer_taku(TAKU)"; the display name may contain
     # parentheses itself, e.g. "nya_harinezumi(にゃ～(よばりん)@美菜子病)",
     # so strip from the first "(" through the final ")".
     user_name <- gsub("\\(.*\\)$", "", usernam)
     user_rname <- substr(usernam, nchar(user_name) + 2, nchar(usernam) - 1)
     # unname(): keep the tweet text out of the data frame's row names.
     messages <- unname(as.character(
       xpathSApply(mydata.xml, "//s:entry/s:title", xmlValue,
                   namespaces = atom_ns)
     ))
     # On Windows convert the Japanese text columns to code page 932.
     # NOTE(review): presumably only wanted where the session's native
     # encoding is CP932 - confirm for UTF-8 Windows sessions.
     if (fg.win) {
       messages <- iconv(messages, from = "UTF-8", to = "WINDOWS-932")
       user_rname <- iconv(user_rname, from = "UTF-8", to = "WINDOWS-932")
     }
     page_data[[length(page_data) + 1L]] <- data.frame(
       pdate = pdate, user_name = user_name, user_rname = user_rname,
       user_url = user_url, messages = messages,
       stringsAsFactors = FALSE
     )
   }
   if (length(page_data) > 0) {
     dat0 <- do.call(rbind, page_data)
   } else {
     # No page produced a hit: return an empty frame with the usual columns.
     dat0 <- data.frame(
       pdate = character(0), user_name = character(0),
       user_rname = character(0), user_url = character(0),
       messages = character(0), stringsAsFactors = FALSE
     )
   }
   print(dim(dat0))
   print(names(dat0))
   list(dat0, mydata.xmls)
 }
 #ここまでをコピーペーストすると定義される。

*使い方
**検索語を" "の中に指定(この例では masason と RT)。続けて取得ページ数を指定。多分最大15ページ(ページあたり100件)
**Windowsの場合､日本語キーワードだとエラーになるようなので､ユーザー名など半角英数字で指定｡
***ユーザー名の切り出しがうまくいかずエラーになることもあるので､別の人などでトライしてみて下さい｡
***sdat<-searchTweet2("孫正義 RT",2)
***sdat<-searchTweet2("ファミリーマート",2)
***sdat<-searchTweet2("family_mart RT",2)
 sdat<-searchTweet2("masason RT",2)
 
 sText<-sdat[[1]]  # first element of the result: the data frame of tweets
 sText[1:5,]   # look at rows 1 to 5
 dim(sText)# how many messages were retrieved
 names(sText)# show the variable names
 #"pdate"      "user_name"  "user_rname" "user_url"   "messages"   date, user name 1, user name 2, URL, message
 table(substr(sText$pdate,1,10))# tabulate by date (first 10 characters of pdate)
 table(sText$user_rname)# tabulate by user_rname
 
 # save the data frame to disk
 save(sText,file="0sText.rda")

*おまけ　文字の内容を判定するためのルーチン(文字コードのWIN変換はこの場合不要)
 #' Flag the elements of `dat` that contain any keyword from `dic`.
 #'
 #' No encoding conversion is applied to `dic`; it turned out to be
 #' unnecessary for this use.
 #'
 #' @param dat Character vector (or factor) of messages to scan.
 #' @param dic Keyword dictionary: one regular expression such as
 #'   "good|nice", or a character vector of alternatives, which are joined
 #'   with "|".
 #' @return Numeric vector as long as `dat`: 1 where a keyword matches
 #'   (ignoring case), 0 elsewhere. The dictionary, the matching messages and
 #'   the number of hits are printed as a side effect.
 parseWords <- function(dat, dic) {
   if (length(dic) == 0) {
     stop("dic must contain at least one keyword", call. = FALSE)
   }
   # grep() uses only the first element of a longer pattern vector, so join
   # all dictionary entries into a single alternation.
   pattern <- paste(dic, collapse = "|")
   no.mes <- grep(pattern, dat, ignore.case = TRUE)
   fg.word <- rep(0, length(dat))
   fg.word[no.mes] <- 1
   print(c("辞書は", dic))
   print(list("キーワードを含むメッセージ"))
   print(dat[no.mes])
   print(c("ヒットしたメッセージの数", sum(fg.word)))
   # Only the 0/1 flags are returned.
   fg.word
 }
*使い方
 # Define the keywords that mark a message as positive / negative, joined with "|"; parseWords checks whether a message contains any of them
 sText$messages    # first skim the messages to spot words that look important
 dic.positive<-"いい|好き|すき|きれい|よい|おいしい"
  posflag<-parseWords(sText$messages,dic.positive)  # flag that is 1 for messages containing a keyword
  posflag

 dic.negative<-"だめ|嫌い|きらい|きたない|よくない|まずい|ひどい|恥ずかしい|やめろ"
  negaflag<-parseWords(sText$messages,dic.negative)
  negaflag
  # store both flags in sText as variables
  sText$posflag<-posflag
  sText$negaflag<-negaflag
  # show the flagged messages
  sText[sText$posflag==1,]
  sText[sText$negaflag==1,]
 # review these results and adjust the dictionary keywords as needed
 # e.g. "よい" also matches "よいしょ",
 # and "いい" also matches "どうでもいい" - subtle false hits like these occur


*調査中
**messageから　@ユーザー名　部分をどうやって切り出すか?
 # Exploratory: show the first three messages, then split each of them on "@", space and ":"
 sText$messages[1:3]
 (RTs <-lapply(sText$messages[1:3],function(x) unlist(strsplit(as.character(x),"[@ :]"))))
 # Earlier attempts, kept commented out:
 #(RTs <-lapply(sText$messages[1],function(x) unlist(sub("![@.* ]","",x))))
 #gregexpr("@.*[: ]",sText$messages[1])
 #gregexpr("@.*[: ]",sText$messages[1])

*参考)　文字コード変更コマンド  Windows用　　上のルーチンには入れてある
 # Show one message, convert it from UTF-8 to Windows code page 932,
 # then store the converted version of every message in sText$mes2
 sText$messages[1]
 iconv(sText$messages[1],from="UTF-8",to="WINDOWS-932") 
   (sText$mes2<-sapply(sText$messages,function(x)  iconv(x,from="UTF-8",to="WINDOWS-932") ))

*参考
 # check the current locale (character encoding)
 (syslocale0<-Sys.getlocale())
 # if text comes out garbled, setting the locale as below may fix it; per the original note this cannot be set on Windows
 Sys.setlocale('LC_ALL','ja_JP.UTF-8')

*参考 認証しないとダウンロードできるメッセージ数に上限があるが､認証するとこれが大きくなるらしい｡
**そのためのパッケージが下記のよう｡誰か研究してみて下さい｡twitterアカウントを登録して､さらに開発者として登録｡
 install.packages("ROAuth")
 library(ROAuth)
 ??ROAuth

*参考)システム情報を取得　(Winならば文字コードを変換するようにしたい)
 (a <- Sys.info())
 # Decide from the OS family only; grepping every Sys.info() field for
 # "WIND" would also match host or user names containing those letters.
 (fg.win <- .Platform$OS.type == "windows")   # TRUE on Windows

*参考)　下記のようなXMLでデータが返ってくるので､それから必要な部分を切り出している｡
  <entry>
    <id>tag:search.twitter.com,2005:81013205966667776</id>
    <published>2011-06-15T15:00:28Z</published>
    <link type="text/html" href="http://twitter.com/02111973/statuses/81013205966667776" rel="alternate"/>
    <title>RT @rishikaw: RT @SoftBankCorp: 本日のエネルギーシフト勉強会における孫正義講演のアーカイブ映像はこちらからご覧ください。 http://ustre.am/:12AwV （iPhone http://ustre.am/:12AwY ） #energyjp</title>
    <content type="html">&lt;b&gt;RT&lt;/b&gt; &lt;a href="http://twitter.com/rishikaw"&gt;@rishikaw&lt;/a&gt;: &lt;b&gt;RT&lt;/b&gt; &lt;a href="http://twitter.com/SoftBankCorp"&gt;@SoftBankCorp&lt;/a&gt;: 本日のエネルギーシフト勉強会における孫正義講演のアーカイブ映像はこちらからご覧ください。 &lt;a href="http://ustre.am/:12AwV"&gt;http://ustre.am/:12AwV&lt;/a&gt; （iPhone &lt;a href="http://ustre.am/:12AwY"&gt;http://ustre.am/:12AwY&lt;/a&gt; ） &lt;a href="http://search.twitter.com/search?q=%23energyjp" onclick="pageTracker._setCustomVar(2, 'result_type', 'recent', 3);pageTracker._trackPageview('/intra/hashtag/#energyjp');"&gt;#energyjp&lt;/a&gt;</content>
    <updated>2011-06-15T15:00:28Z</updated>
    <link type="image/png" href="http://a2.twimg.com/profile_images/1277806313/52d05d41-fb47-4625-951b-7d7e82d776f2_normal.png" rel="image"/>
    <twitter:geo>
    </twitter:geo>
    <twitter:metadata>
      <twitter:result_type>recent</twitter:result_type>
    </twitter:metadata>
    <twitter:source>&lt;a href="http://twipple.jp/" rel="nofollow"&gt;ついっぷる for iPhone&lt;/a&gt;</twitter:source>
    <twitter:lang>ja</twitter:lang>
    <author>
      <name>02111973 (akaおんちゃん)</name>
      <uri>http://twitter.com/02111973</uri>
    </author>
  </entry>

*以下はふるいもの
 **練習　自分の興味のある人のユーザー名(twitter)を入力して上記を実行してみる｡
*練習　　興味のある人のツイートをダウンロードしてみる     nも変更してみてどうなるか？　300や1000ぐらいにするとどうなるか？
*練習　　興味のある検索語を指定してダウンロードしてみる。検索語やnも変更してみてどうなるか？　300や1000ぐらいにするとどうなるか？
**検索語に　RT　を追加すると　その語を含むRTがとれるようなので、これをうまく使えば、アカウント間の関係を把握できそう。その他、ツイッターの検索のオプションを指定してみてください。
**参考）マニュアルによると下のように開始、終了日を指定できるはずだが、うまくいかない感じ。nやlangなどの与え方によってはとれるかもしれないので、いろいろトライしてみてください。
**全体的に今回のプロジェクトでこのデータを使えそうか否かを考察。

 sText$messages[1]<-"RT @softbank_ichiro: 岡田氏は、再生エネルギー法案をどう考えているのか！？単に政局のみに終始しては、日本は道を誤る！！QT: 政局：「再生エネ法も」に自民不信…退陣条件追加 - 毎日ｊｐ(毎日新聞) http://t.co/taCJRLj #genpatsu @masason"

 txt <- as.character(sText$messages[1])
 # An "@user" mention runs from "@" up to, but not including, the next space
 # or colon - or to the end of the text when no separator follows, as for a
 # trailing "@masason". A text without any "@" yields character(0).
 retweeter <- regmatches(txt, gregexpr("@[^ :]*", txt))[[1]]
 retweeter